1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * An implementation of the IPoIB standard based on PSARC 2001/289.
  28  */
  29 
  30 #include <sys/types.h>
  31 #include <sys/conf.h>
  32 #include <sys/ddi.h>
  33 #include <sys/sunddi.h>
  34 #include <sys/modctl.h>
  35 #include <sys/stropts.h>
  36 #include <sys/stream.h>
  37 #include <sys/strsun.h>
  38 #include <sys/strsubr.h>
  39 #include <sys/dlpi.h>
  40 #include <sys/mac_provider.h>
  41 
  42 #include <sys/pattr.h>            /* for HCK_FULLCKSUM */
  43 #include <sys/sysmacros.h>        /* for offsetof */
  44 #include <sys/disp.h>             /* for async thread pri */
  45 #include <sys/atomic.h>           /* for atomic_add*() */
  46 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */
  47 #include <netinet/in.h>           /* for netinet/ip.h below */
  48 #include <netinet/ip.h>           /* for struct ip */
  49 #include <netinet/udp.h>  /* for struct udphdr */
  50 #include <inet/common.h>  /* for inet/ip.h below */
  51 #include <inet/ip.h>              /* for ipha_t */
  52 #include <inet/ip6.h>             /* for ip6_t */
  53 #include <inet/tcp.h>             /* for tcph_t */
  54 #include <netinet/icmp6.h>        /* for icmp6_t */
  55 #include <sys/callb.h>
  56 #include <sys/modhash.h>
  57 
  58 #include <sys/ib/clients/ibd/ibd.h>
  59 #include <sys/ib/mgt/sm_attr.h>   /* for SM_INIT_TYPE_* */
  60 #include <sys/note.h>
  61 #include <sys/multidata.h>
  62 
  63 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */
  64 
  65 #include <sys/priv_names.h>
  66 #include <sys/dls.h>
  67 #include <sys/dld_ioc.h>
  68 #include <sys/policy.h>
  69 #include <sys/ibpart.h>
  70 #include <sys/file.h>
  71 
  72 /*
  73  * The write-up below includes details on the following:
  74  * 1. The dladm administrative model.
  75  * 2. Late HCA initialization feature.
   76  * 3. Brussels support and its implications for the current architecture.
  77  *
  78  * 1. The dladm administrative model.
  79  * ------------------------------------------
  80  * With the dladm model, ibnex will create one ibd instance per port. These
  81  * instances will be created independent of the port state.
  82  *
   83  * The ibd driver is two-faceted: one side works as the port driver and the
   84  * other as the partition object driver.
  85  *
  86  * The port instance is a child of the HCA, and will have an entry in the devfs.
  87  * A DDI attach only happens for the port driver, and its attach is
   88  * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
  89  * handled in ibd_port_unattach().
  90  *
  91  * The partition object is only a registrant to the mac layer via mac_register()
  92  * and does not have an entry in the device tree. There is no DDI softstate
  93  * managed by the DDI framework for the partition objects. However, the state is
  94  * managed inside the ibd driver, and every partition object hangs off the
  95  * "ibd_objlist_head".
  96  *
  97  * The partition object first comes into existence when a user runs the
  98  * 'create-part' subcommand of dladm. This is like invoking the attach entry
  99  * point of the partition object. The partition object goes away with the
 100  * 'delete-part' subcommand of dladm. This is like invoking the detach entry
 101  * point of the partition object.
 102  *
 103  * The create-part and delete-part subcommands result in dld ioctls that end up
  104  * calling ibd_create_partition() and ibd_delete_partition(), respectively.
  105  * These ioctls are registered with the dld layer in _init() via a call to
 106  * dld_ioc_register().
 107  *
  108  * The port instance by itself cannot be plumbed. Only the partition objects
  109  * can be plumbed, and they alone participate in I/O; the port driver does
  110  * not.
 111  *
 112  * There are some info ioctls supported in ibd which are used by dladm(1M) to
 113  * display useful information. The info entry point for ibd is
 114  * ibd_get_partition_info().
 115  *
 116  * 2. Late HCA initialization feature.
 117  * ------------------------------------
 118  * As mentioned in section 1, the user creates the partition objects via
 119  * dladm(1M). It is possible that:
 120  * a) The physical port itself is down and the SM cannot be reached.
  121  * b) The PKEY specified by the user has not been created in the SM yet.
 122  * c) An IPoIB broadcast group for the specified PKEY is not present.
 123  *
 124  * In all of the above cases, complete initialization of the partition object is
 125  * not possible. However, the new model allows the creation of partition
  126  * objects even in such cases, but defers full initialization until later.
 127  * When such a partition object is plumbed, the link state will be displayed as
 128  * "down".
 129  * The driver, at this point, is listening to events that herald the
 130  * availability of resources -
 131  * i)   LINK_UP when the link becomes available
 132  * ii)  PORT_CHANGE when the PKEY has been created
 133  * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
 134  * created
  135  * These events arrive via ibd_async_handler() for events i) and ii), and via
  136  * ibd_snet_notices_handler() for event iii).
 137  * The driver handles these events (as and when they arrive) and completes the
 138  * initialization of the partition object and transitions it to a usable state.
 139  *
  140  * 3. Brussels support and its implications for the current architecture.
 141  * ---------------------------------------------------------------------
  142  * The Brussels support introduces two new interfaces to the ibd driver -
 143  * ibd_m_getprop() and ibd_m_setprop().
 144  * These interfaces allow setting and retrieval of certain properties.
  145  * Some of them are public properties while most others are private properties
  146  * meant to be used by developers. Tuning the latter kind can cause
  147  * performance issues and should not be done without understanding the
 148  * implications. All properties are specific to an instance of either the
 149  * partition object or the port driver.
 150  *
  151  * The public properties are: mtu and linkmode.
 152  * mtu is a read-only property.
 153  * linkmode can take two values - UD and CM.
 154  *
 155  * Changing the linkmode requires some bookkeeping in the driver. The
 156  * capabilities need to be re-reported to the mac layer. This is done by
 157  * calling mac_capab_update().  The maxsdu is updated by calling
 158  * mac_maxsdu_update2().
 159  * The private properties retain their values across the change of linkmode.
 160  * NOTE:
 161  * - The port driver does not support any property apart from mtu.
 162  * - All other properties are only meant for the partition object.
 163  * - The properties cannot be set when an instance is plumbed. The
 164  * instance has to be unplumbed to effect any setting.
 165  */
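
      /*
       * Illustrative administrative flow (not authoritative; see dladm(1M)
       * for the exact syntax), tying the dladm subcommands to the entry
       * points named in the write-up above:
       *
       *   dladm create-part -l <port-link> -P <pkey> <part-link>
       *       -> IBD_CREATE_IBPART dld ioctl -> ibd_create_partition()
       *   dladm delete-part <part-link>
       *       -> IBD_DELETE_IBPART dld ioctl -> ibd_delete_partition()
       *   dladm show-part (the info ioctls used for display)
       *       -> IBD_INFO_IBPART dld ioctl -> ibd_get_partition_info()
       *   dladm set-linkprop -p linkmode=cm (or ud) <part-link>
       *       -> mac property framework -> ibd_m_setprop()
       *
       * Per the NOTE above, properties such as linkmode can only be set
       * while the partition object is unplumbed.
       */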
 166 
 167 /*
 168  * Driver wide tunables
 169  *
 170  * ibd_tx_softintr
 171  * ibd_rx_softintr
 172  *     The softintr mechanism allows ibd to avoid event queue overflows if
  173  *     the receive/completion handlers are expensive. Both are enabled
 174  *     by default.
 175  *
 176  * ibd_log_sz
 177  *     This specifies the size of the ibd log buffer in bytes. The buffer is
 178  *     allocated and logging is enabled only when IBD_LOGGING is defined.
 179  *
 180  */
 181 uint_t ibd_rx_softintr = 1;
 182 uint_t ibd_tx_softintr = 1;
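
      /*
       * For illustration only (a local-configuration example, not a
       * requirement): being module globals, these could be overridden at
       * boot time from /etc/system, e.g.
       *
       *   set ibd:ibd_rx_softintr = 0
       *   set ibd:ibd_tx_softintr = 0
       */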
 183 
 184 #ifdef IBD_LOGGING
 185 uint_t ibd_log_sz = 0x20000;
 186 #endif
 187 
 188 #ifdef IBD_LOGGING
 189 #define IBD_LOG_SZ                      ibd_log_sz
 190 #endif
 191 
 192 /* Post IBD_RX_POST_CNT receive work requests at a time. */
 193 #define IBD_RX_POST_CNT                 8
 194 
 195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
 196 #define IBD_LOG_RX_POST                 4
 197 
  198 /* Minimum number of receive work requests the driver needs to always have */
 199 #define IBD_RWQE_MIN    ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
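
      /*
       * With the values above this works out to 1 << IBD_LOG_RX_POST = 16
       * rx post queues, and IBD_RWQE_MIN = (8 << 4) * 4 = 512 receive work
       * requests that must always be available.
       */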
 200 
 201 /*
 202  * LSO parameters
 203  */
 204 #define IBD_LSO_MAXLEN                  65536
 205 #define IBD_LSO_BUFSZ                   8192
 206 
 207 /*
 208  * Async operation states
 209  */
 210 #define IBD_OP_NOTSTARTED               0
 211 #define IBD_OP_ONGOING                  1
 212 #define IBD_OP_COMPLETED                2
 213 #define IBD_OP_ERRORED                  3
 214 #define IBD_OP_ROUTERED                 4
 215 
 216 /*
 217  * Start/stop in-progress flags; note that restart must always remain
 218  * the OR of start and stop flag values.
 219  */
 220 #define IBD_DRV_START_IN_PROGRESS       0x10000000
 221 #define IBD_DRV_STOP_IN_PROGRESS        0x20000000
 222 #define IBD_DRV_RESTART_IN_PROGRESS     0x30000000
 223 #define IBD_DRV_DELETE_IN_PROGRESS      IBD_DRV_RESTART_IN_PROGRESS
 224 
 225 /*
 226  * Miscellaneous constants
 227  */
 228 #define IB_MGID_IPV4_LOWGRP_MASK        0xFFFFFFFF
 229 #define IBD_DEF_MAX_SDU                 2044
 230 #define IBD_DEF_MAX_MTU                 (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
 231 #define IBD_DEF_RC_MAX_SDU              65520
 232 #define IBD_DEF_RC_MAX_MTU              (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
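
      /*
       * For reference (assuming the usual 4-byte IPoIB encapsulation header
       * for IPOIB_HDRSIZE): the UD default corresponds to a 2048-byte MTU
       * (2044 + 4) and the RC default to 65524 bytes (65520 + 4).
       */
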
 233 #define IBD_DEFAULT_QKEY                0xB1B
 234 #ifdef IBD_LOGGING
 235 #define IBD_DMAX_LINE                   100
 236 #endif
 237 
 238 /*
 239  * Enumerations for link states
 240  */
 241 typedef enum {
 242         IBD_LINK_DOWN,
 243         IBD_LINK_UP,
 244         IBD_LINK_UP_ABSENT
 245 } ibd_link_op_t;
 246 
 247 /*
 248  * Driver State Pointer
 249  */
 250 void *ibd_list;
 251 
 252 /*
 253  * Driver Global Data
 254  */
 255 ibd_global_state_t ibd_gstate;
 256 
 257 /*
 258  * Partition object list
 259  */
 260 ibd_state_t     *ibd_objlist_head = NULL;
 261 kmutex_t        ibd_objlist_lock;
 262 
 263 int ibd_rc_conn_timeout = 60 * 10;      /* 10 minutes */
 264 
 265 /*
 266  * Logging
 267  */
 268 #ifdef IBD_LOGGING
 269 kmutex_t ibd_lbuf_lock;
 270 uint8_t *ibd_lbuf;
 271 uint32_t ibd_lbuf_ndx;
 272 #endif
 273 
 274 /*
 275  * Required system entry points
 276  */
 277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
 278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
 279 
 280 /*
 281  * Required driver entry points for GLDv3
 282  */
 283 static int ibd_m_stat(void *, uint_t, uint64_t *);
 284 static int ibd_m_start(void *);
 285 static void ibd_m_stop(void *);
 286 static int ibd_m_promisc(void *, boolean_t);
 287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
 288 static int ibd_m_unicst(void *, const uint8_t *);
 289 static mblk_t *ibd_m_tx(void *, mblk_t *);
 290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
 291 
 292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
 293     const void *);
 294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
 295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
 296     mac_prop_info_handle_t);
 297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
 298     const void *);
 299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
 300 
 301 /*
 302  * Private driver entry points for GLDv3
 303  */
 304 
 305 /*
 306  * Initialization
 307  */
 308 static int ibd_state_init(ibd_state_t *, dev_info_t *);
 309 static int ibd_init_txlist(ibd_state_t *);
 310 static int ibd_init_rxlist(ibd_state_t *);
 311 static int ibd_acache_init(ibd_state_t *);
 312 #ifdef IBD_LOGGING
 313 static void ibd_log_init(void);
 314 #endif
 315 
 316 /*
 317  * Termination/cleanup
 318  */
 319 static void ibd_state_fini(ibd_state_t *);
 320 static void ibd_fini_txlist(ibd_state_t *);
 321 static void ibd_fini_rxlist(ibd_state_t *);
 322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
 323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
 324 static void ibd_acache_fini(ibd_state_t *);
 325 #ifdef IBD_LOGGING
 326 static void ibd_log_fini(void);
 327 #endif
 328 
 329 /*
 330  * Allocation/acquire/map routines
 331  */
 332 static int ibd_alloc_tx_copybufs(ibd_state_t *);
 333 static int ibd_alloc_rx_copybufs(ibd_state_t *);
 334 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
 335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
 336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
 337     uint32_t *);
 338 
 339 /*
 340  * Free/release/unmap routines
 341  */
 342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
 343 static void ibd_free_tx_copybufs(ibd_state_t *);
 344 static void ibd_free_rx_copybufs(ibd_state_t *);
 345 static void ibd_free_rx_rsrcs(ibd_state_t *);
 346 static void ibd_free_tx_lsobufs(ibd_state_t *);
 347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
 348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
 349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
 350 
 351 /*
 352  * Handlers/callback routines
 353  */
 354 static uint_t ibd_intr(caddr_t);
 355 static uint_t ibd_tx_recycle(caddr_t);
 356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
 357 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
 358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
 359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
 360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
 361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
 362 static void ibd_freemsg_cb(char *);
 363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
 364     ibt_async_event_t *);
 365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
 366     ibt_async_event_t *);
 367 static void ibd_snet_notices_handler(void *, ib_gid_t,
 368     ibt_subnet_event_code_t, ibt_subnet_event_t *);
 369 
 370 /*
 371  * Send/receive routines
 372  */
 373 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
 374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
 375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
 376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
 377 
 378 /*
 379  * Threads
 380  */
 381 static void ibd_async_work(ibd_state_t *);
 382 
 383 /*
 384  * Async tasks
 385  */
 386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
 387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
 388 static void ibd_async_setprom(ibd_state_t *);
 389 static void ibd_async_unsetprom(ibd_state_t *);
 390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
 391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
 392 static void ibd_async_txsched(ibd_state_t *);
 393 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
 394 
 395 /*
 396  * Async task helpers
 397  */
 398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
 399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
 400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
 401 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
 402     ipoib_mac_t *, ipoib_mac_t *);
 403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
 404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
 405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
 406 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
 407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
 408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
 409 static uint64_t ibd_get_portspeed(ibd_state_t *);
 410 static boolean_t ibd_async_safe(ibd_state_t *);
 411 static void ibd_async_done(ibd_state_t *);
 412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
 413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
 414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
 415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
 416 
 417 /*
 418  * Helpers for attach/start routines
 419  */
 420 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
 421 static int ibd_record_capab(ibd_state_t *);
 422 static int ibd_get_port_details(ibd_state_t *);
 423 static int ibd_alloc_cqs(ibd_state_t *);
 424 static int ibd_setup_ud_channel(ibd_state_t *);
 425 static int ibd_start(ibd_state_t *);
 426 static int ibd_undo_start(ibd_state_t *, link_state_t);
 427 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
 428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
 429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
 430 static void ibd_part_unattach(ibd_state_t *state);
 431 static int ibd_port_attach(dev_info_t *);
 432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
 433 static int ibd_get_port_state(ibd_state_t *, link_state_t *);
 434 static int ibd_part_busy(ibd_state_t *);
 435 
 436 /*
 437  * Miscellaneous helpers
 438  */
 439 static int ibd_sched_poll(ibd_state_t *, int, int);
 440 static void ibd_resume_transmission(ibd_state_t *);
 441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
 442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
 443 static void *list_get_head(list_t *);
 444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
 445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
 446 
 447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
 448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
 449 
 450 #ifdef IBD_LOGGING
 451 static void ibd_log(const char *, ...);
 452 #endif
 453 
 454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
 455     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
 456 
 457 /* Module Driver Info */
 458 static struct modldrv ibd_modldrv = {
 459         &mod_driverops,                     /* This one is a driver */
 460         "InfiniBand GLDv3 Driver",      /* short description */
 461         &ibd_dev_ops                        /* driver specific ops */
 462 };
 463 
 464 /* Module Linkage */
 465 static struct modlinkage ibd_modlinkage = {
 466         MODREV_1, (void *)&ibd_modldrv, NULL
 467 };
 468 
 469 /*
 470  * Module (static) info passed to IBTL during ibt_attach
 471  */
 472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
 473         IBTI_V_CURR,
 474         IBT_NETWORK,
 475         ibd_async_handler,
 476         NULL,
 477         "IBPART"
 478 };
 479 
 480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
 481         IBTI_V_CURR,
 482         IBT_NETWORK,
 483         ibdpd_async_handler,
 484         NULL,
 485         "IPIB"
 486 };
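
      /*
       * Note: two IBTF client registrations are defined above. Going by the
       * names alone (an inference, not stated in the write-up above),
       * ibd_clnt_modinfo ("IBPART") appears to be the client info used when
       * a partition object attaches to IBTF, while ibdpd_clnt_modinfo
       * ("IPIB", with "pd" suggesting port driver) appears to be used by
       * the port driver instances.
       */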
 487 
 488 /*
 489  * GLDv3 entry points
 490  */
 491 #define IBD_M_CALLBACK_FLAGS    \
 492         (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
 493 
 494 static mac_callbacks_t ibd_m_callbacks = {
 495         IBD_M_CALLBACK_FLAGS,
 496         ibd_m_stat,
 497         ibd_m_start,
 498         ibd_m_stop,
 499         ibd_m_promisc,
 500         ibd_m_multicst,
 501         ibd_m_unicst,
 502         ibd_m_tx,
 503         NULL,
 504         NULL,
 505         ibd_m_getcapab,
 506         NULL,
 507         NULL,
 508         ibd_m_setprop,
 509         ibd_m_getprop,
 510         ibd_m_propinfo
 511 };
 512 
 513 /* Private properties */
 514 char *ibd_priv_props[] = {
 515         "_ibd_broadcast_group",
 516         "_ibd_coalesce_completions",
 517         "_ibd_create_broadcast_group",
 518         "_ibd_hash_size",
 519         "_ibd_lso_enable",
 520         "_ibd_num_ah",
 521         "_ibd_num_lso_bufs",
 522         "_ibd_rc_enable_srq",
 523         "_ibd_rc_num_rwqe",
 524         "_ibd_rc_num_srq",
 525         "_ibd_rc_num_swqe",
 526         "_ibd_rc_rx_comp_count",
 527         "_ibd_rc_rx_comp_usec",
 528         "_ibd_rc_rx_copy_thresh",
 529         "_ibd_rc_rx_rwqe_thresh",
 530         "_ibd_rc_tx_comp_count",
 531         "_ibd_rc_tx_comp_usec",
 532         "_ibd_rc_tx_copy_thresh",
 533         "_ibd_ud_num_rwqe",
 534         "_ibd_ud_num_swqe",
 535         "_ibd_ud_rx_comp_count",
 536         "_ibd_ud_rx_comp_usec",
 537         "_ibd_ud_tx_comp_count",
 538         "_ibd_ud_tx_comp_usec",
 539         "_ibd_ud_tx_copy_thresh",
 540         NULL
 541 };
 542 
 543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
 544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
 545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
 546 
 547 static dld_ioc_info_t ibd_dld_ioctl_list[] = {
 548         {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
 549             ibd_create_partition, secpolicy_dl_config},
 550         {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
 551             ibd_delete_partition, secpolicy_dl_config},
 552         {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
 553             ibd_get_partition_info, NULL}
 554 };
 555 
 556 /*
 557  * Fill/clear <scope> and <p_key> in multicast/broadcast address
 558  */
 559 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)         \
 560 {                                                       \
 561         *(uint32_t *)((char *)(maddr) + 4) |=           \
 562             htonl((uint32_t)(scope) << 16);               \
 563         *(uint32_t *)((char *)(maddr) + 8) |=           \
 564             htonl((uint32_t)(pkey) << 16);                \
 565 }
 566 
 567 #define IBD_CLEAR_SCOPE_PKEY(maddr)                     \
 568 {                                                       \
 569         *(uint32_t *)((char *)(maddr) + 4) &=               \
 570             htonl(~((uint32_t)0xF << 16));                \
 571         *(uint32_t *)((char *)(maddr) + 8) &=               \
 572             htonl(~((uint32_t)0xFFFF << 16));             \
 573 }
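
      /*
       * Worked example (assuming maddr points at the 20-byte IPoIB address,
       * i.e. a 4-byte QPN followed by the 16-byte MGID): IBD_FILL_SCOPE_PKEY
       * ORs the scope into the flags/scope byte of the MGID (offset 5 of
       * maddr) and the pkey into MGID bytes 4-5 (offsets 8-9 of maddr). For
       * the IPv4 broadcast group with link-local scope (0x2) and the default
       * pkey 0xFFFF, the resulting MGID is the well-known
       * ff12:401b:ffff:0000:0000:0000:ffff:ffff.
       */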
 574 
 575 /*
 576  * Rudimentary debugging support
 577  */
 578 #ifdef DEBUG
 579 int ibd_debuglevel = 100;
 580 void
 581 debug_print(int l, char *fmt, ...)
 582 {
 583         va_list ap;
 584 
 585         if (l < ibd_debuglevel)
 586                 return;
 587         va_start(ap, fmt);
 588         vcmn_err(CE_CONT, fmt, ap);
 589         va_end(ap);
 590 }
 591 #endif
 592 
 593 /*
 594  * Common routine to print warning messages; adds in hca guid, port number
 595  * and pkey to be able to identify the IBA interface.
 596  */
 597 void
 598 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
 599 {
 600         ib_guid_t hca_guid;
 601         char ibd_print_buf[MAXNAMELEN + 256];
 602         int len;
 603         va_list ap;
 604         char part_name[MAXNAMELEN];
 605         datalink_id_t linkid = state->id_plinkid;
 606 
 607         hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
 608             0, "hca-guid", 0);
 609         (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL);
 610         len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
 611             "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ",
 612             ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
 613             (u_longlong_t)hca_guid, state->id_port, state->id_pkey,
 614             part_name);
 615         va_start(ap, fmt);
 616         (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
 617             fmt, ap);
 618         cmn_err(CE_NOTE, "!%s", ibd_print_buf);
 619         va_end(ap);
 620 }
 621 
 622 /*
 623  * Warlock directives
 624  */
 625 
 626 /*
 627  * id_lso_lock
 628  *
 629  * state->id_lso->bkt_nfree may be accessed without a lock to
 630  * determine the threshold at which we have to ask the nw layer
 631  * to resume transmission (see ibd_resume_transmission()).
 632  */
 633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
 634     ibd_state_t::id_lso))
 635 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
 636 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
 637 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
 638 
 639 /*
 640  * id_scq_poll_lock
 641  */
 642 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
 643     ibd_state_t::id_scq_poll_busy))
 644 
 645 /*
 646  * id_txpost_lock
 647  */
 648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
 649     ibd_state_t::id_tx_head))
 650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
 651     ibd_state_t::id_tx_busy))
 652 
 653 /*
 654  * id_acache_req_lock
 655  */
 656 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 
 657     ibd_state_t::id_acache_req_cv))
 658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 
 659     ibd_state_t::id_req_list))
 660 _NOTE(SCHEME_PROTECTS_DATA("atomic",
 661     ibd_acache_s::ac_ref))
 662 
 663 /*
 664  * id_ac_mutex
 665  *
 666  * This mutex is actually supposed to protect id_ah_op as well,
 667  * but this path of the code isn't clean (see update of id_ah_op
 668  * in ibd_async_acache(), immediately after the call to
 669  * ibd_async_mcache()). For now, we'll skip this check by
 670  * declaring that id_ah_op is protected by some internal scheme
 671  * that warlock isn't aware of.
 672  */
 673 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
 674     ibd_state_t::id_ah_active))
 675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
 676     ibd_state_t::id_ah_free))
 677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
 678     ibd_state_t::id_ah_addr))
 679 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
 680     ibd_state_t::id_ah_op))
 681 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
 682     ibd_state_t::id_ah_error))
 683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
 684     ibd_state_t::id_ac_hot_ace))
 685 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
 686 
 687 /*
 688  * id_mc_mutex
 689  */
 690 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
 691     ibd_state_t::id_mc_full))
 692 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
 693     ibd_state_t::id_mc_non))
 694 
 695 /*
 696  * id_trap_lock
 697  */
 698 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
 699     ibd_state_t::id_trap_cv))
 700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
 701     ibd_state_t::id_trap_stop))
 702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
 703     ibd_state_t::id_trap_inprog))
 704 
 705 /*
 706  * id_prom_op
 707  */
 708 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
 709     ibd_state_t::id_prom_op))
 710 
 711 /*
 712  * id_sched_lock
 713  */
 714 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
 715     ibd_state_t::id_sched_needed))
 716 
 717 /*
 718  * id_link_mutex
 719  */
 720 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 
 721     ibd_state_t::id_link_state))
 722 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
 723 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
 724     ibd_state_t::id_link_speed))
 725 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
 726 
 727 /*
 728  * id_tx_list.dl_mutex
 729  */
 730 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
 731     ibd_state_t::id_tx_list.dl_head))
 732 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
 733     ibd_state_t::id_tx_list.dl_pending_sends))
 734 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
 735     ibd_state_t::id_tx_list.dl_cnt))
 736 
 737 /*
 738  * id_rx_list.dl_mutex
 739  */
 740 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
 741     ibd_state_t::id_rx_list.dl_bufs_outstanding))
 742 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
 743     ibd_state_t::id_rx_list.dl_cnt))
 744 
 745 /*
 746  * rc_timeout_lock
 747  */
 748 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
 749     ibd_state_t::rc_timeout_start))
 750 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
 751     ibd_state_t::rc_timeout))
 752 
 753 
 754 /*
 755  * Items protected by atomic updates
 756  */
 757 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
 758     ibd_state_s::id_brd_rcv
 759     ibd_state_s::id_brd_xmt
 760     ibd_state_s::id_multi_rcv
 761     ibd_state_s::id_multi_xmt
 762     ibd_state_s::id_num_intrs
 763     ibd_state_s::id_rcv_bytes
 764     ibd_state_s::id_rcv_pkt
 765     ibd_state_s::id_rx_post_queue_index
 766     ibd_state_s::id_tx_short
 767     ibd_state_s::id_xmt_bytes
 768     ibd_state_s::id_xmt_pkt
 769     ibd_state_s::rc_rcv_trans_byte
 770     ibd_state_s::rc_rcv_trans_pkt
 771     ibd_state_s::rc_rcv_copy_byte
 772     ibd_state_s::rc_rcv_copy_pkt
 773     ibd_state_s::rc_xmt_bytes
 774     ibd_state_s::rc_xmt_small_pkt
 775     ibd_state_s::rc_xmt_fragmented_pkt
 776     ibd_state_s::rc_xmt_map_fail_pkt
 777     ibd_state_s::rc_xmt_map_succ_pkt
 778     ibd_rc_chan_s::rcq_invoking))
 779 
 780 /*
 781  * Non-mutex protection schemes for data elements. Almost all of
 782  * these are non-shared items.
 783  */
 784 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
 785     callb_cpr
 786     ib_gid_s
 787     ib_header_info
 788     ibd_acache_rq
 789     ibd_acache_s::ac_mce
 790     ibd_acache_s::ac_chan
 791     ibd_mcache::mc_fullreap
 792     ibd_mcache::mc_jstate
 793     ibd_mcache::mc_req
 794     ibd_rwqe_s
 795     ibd_swqe_s
 796     ibd_wqe_s
 797     ibt_wr_ds_s::ds_va
 798     ibt_wr_lso_s
 799     ipoib_mac::ipoib_qpn
 800     mac_capab_lso_s
 801     msgb::b_next
 802     msgb::b_cont
 803     msgb::b_rptr
 804     msgb::b_wptr
 805     ibd_state_s::id_bgroup_created
 806     ibd_state_s::id_mac_state
 807     ibd_state_s::id_mtu
 808     ibd_state_s::id_ud_num_rwqe
 809     ibd_state_s::id_ud_num_swqe
 810     ibd_state_s::id_qpnum
 811     ibd_state_s::id_rcq_hdl
 812     ibd_state_s::id_rx_buf_sz
 813     ibd_state_s::id_rx_bufs
 814     ibd_state_s::id_rx_mr_hdl
 815     ibd_state_s::id_rx_wqes
 816     ibd_state_s::id_rxwcs
 817     ibd_state_s::id_rxwcs_size
 818     ibd_state_s::id_rx_nqueues
 819     ibd_state_s::id_rx_queues
 820     ibd_state_s::id_scope
 821     ibd_state_s::id_scq_hdl
 822     ibd_state_s::id_tx_buf_sz
 823     ibd_state_s::id_tx_bufs
 824     ibd_state_s::id_tx_mr_hdl
 825     ibd_state_s::id_tx_rel_list.dl_cnt
 826     ibd_state_s::id_tx_wqes
 827     ibd_state_s::id_txwcs
 828     ibd_state_s::id_txwcs_size
 829     ibd_state_s::rc_listen_hdl
 830     ibd_state_s::rc_listen_hdl_OFED_interop
 831     ibd_state_s::rc_srq_size
 832     ibd_state_s::rc_srq_rwqes
 833     ibd_state_s::rc_srq_rx_bufs
 834     ibd_state_s::rc_srq_rx_mr_hdl
 835     ibd_state_s::rc_tx_largebuf_desc_base
 836     ibd_state_s::rc_tx_mr_bufs
 837     ibd_state_s::rc_tx_mr_hdl
 838     ipha_s
 839     icmph_s
 840     ibt_path_info_s::pi_sid
 841     ibd_rc_chan_s::ace
 842     ibd_rc_chan_s::chan_hdl
 843     ibd_rc_chan_s::state
 844     ibd_rc_chan_s::chan_state
 845     ibd_rc_chan_s::is_tx_chan
 846     ibd_rc_chan_s::rcq_hdl
 847     ibd_rc_chan_s::rcq_size
 848     ibd_rc_chan_s::scq_hdl
 849     ibd_rc_chan_s::scq_size
 850     ibd_rc_chan_s::rx_bufs
 851     ibd_rc_chan_s::rx_mr_hdl
 852     ibd_rc_chan_s::rx_rwqes
 853     ibd_rc_chan_s::tx_wqes
 854     ibd_rc_chan_s::tx_mr_bufs
 855     ibd_rc_chan_s::tx_mr_hdl
 856     ibd_rc_chan_s::tx_rel_list.dl_cnt
 857     ibd_rc_chan_s::is_used
 858     ibd_rc_tx_largebuf_s::lb_buf
 859     ibd_rc_msg_hello_s
 860     ibt_cm_return_args_s))
 861 
 862 /*
 863  * ibd_rc_chan_s::next is protected by two mutexes:
 864  * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
 865  * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
 866  */
 867 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
 868     ibd_rc_chan_s::next))
 869 
 870 /*
 871  * ibd_state_s.rc_tx_large_bufs_lock
 872  */
 873 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
 874     ibd_state_s::rc_tx_largebuf_free_head))
 875 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
 876     ibd_state_s::rc_tx_largebuf_nfree))
 877 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
 878     ibd_rc_tx_largebuf_s::lb_next))
 879 
 880 /*
 881  * ibd_acache_s.tx_too_big_mutex
 882  */
 883 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
 884     ibd_acache_s::tx_too_big_ongoing))
 885 
 886 /*
 887  * tx_wqe_list.dl_mutex
 888  */
 889 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
 890     ibd_rc_chan_s::tx_wqe_list.dl_head))
 891 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
 892     ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
 893 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
 894     ibd_rc_chan_s::tx_wqe_list.dl_cnt))
 895 
 896 /*
 897  * ibd_state_s.rc_ace_recycle_lock
 898  */
 899 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
 900     ibd_state_s::rc_ace_recycle))
 901 
 902 /*
 903  * rc_srq_rwqe_list.dl_mutex
 904  */
 905 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
 906     ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
 907 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
 908     ibd_state_t::rc_srq_rwqe_list.dl_cnt))
 909 
 910 /*
 911  * Non-mutex protection schemes for data elements. They are counters
  912  * for problem diagnosis and do not need to be protected.
 913  */
 914 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
 915     ibd_state_s::rc_rcv_alloc_fail
 916     ibd_state_s::rc_rcq_err
 917     ibd_state_s::rc_ace_not_found
 918     ibd_state_s::rc_xmt_drop_too_long_pkt
 919     ibd_state_s::rc_xmt_icmp_too_long_pkt
 920     ibd_state_s::rc_xmt_reenter_too_long_pkt
 921     ibd_state_s::rc_swqe_short
 922     ibd_state_s::rc_swqe_mac_update
 923     ibd_state_s::rc_xmt_buf_short
 924     ibd_state_s::rc_xmt_buf_mac_update
 925     ibd_state_s::rc_scq_no_swqe
 926     ibd_state_s::rc_scq_no_largebuf
 927     ibd_state_s::rc_conn_succ
 928     ibd_state_s::rc_conn_fail
 929     ibd_state_s::rc_null_conn
 930     ibd_state_s::rc_no_estab_conn
 931     ibd_state_s::rc_act_close
 932     ibd_state_s::rc_pas_close
 933     ibd_state_s::rc_delay_ace_recycle
 934     ibd_state_s::rc_act_close_simultaneous
 935     ibd_state_s::rc_act_close_not_clean
 936     ibd_state_s::rc_pas_close_rcq_invoking
 937     ibd_state_s::rc_reset_cnt
 938     ibd_state_s::rc_timeout_act
 939     ibd_state_s::rc_timeout_pas
 940     ibd_state_s::rc_stop_connect))
 941 
 942 #ifdef DEBUG
 943 /*
 944  * Non-mutex protection schemes for data elements. They are counters
  945  * for problem diagnosis and do not need to be protected.
 946  */
 947 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
 948     ibd_state_s::rc_rwqe_short
 949     ibd_rc_stat_s::rc_rcv_trans_byte
 950     ibd_rc_stat_s::rc_rcv_trans_pkt
 951     ibd_rc_stat_s::rc_rcv_copy_byte
 952     ibd_rc_stat_s::rc_rcv_copy_pkt
 953     ibd_rc_stat_s::rc_rcv_alloc_fail
 954     ibd_rc_stat_s::rc_rcq_err 
 955     ibd_rc_stat_s::rc_rwqe_short
 956     ibd_rc_stat_s::rc_xmt_bytes
 957     ibd_rc_stat_s::rc_xmt_small_pkt
 958     ibd_rc_stat_s::rc_xmt_fragmented_pkt
 959     ibd_rc_stat_s::rc_xmt_map_fail_pkt
 960     ibd_rc_stat_s::rc_xmt_map_succ_pkt
 961     ibd_rc_stat_s::rc_ace_not_found
 962     ibd_rc_stat_s::rc_scq_no_swqe
 963     ibd_rc_stat_s::rc_scq_no_largebuf
 964     ibd_rc_stat_s::rc_swqe_short
 965     ibd_rc_stat_s::rc_swqe_mac_update
 966     ibd_rc_stat_s::rc_xmt_buf_short
 967     ibd_rc_stat_s::rc_xmt_buf_mac_update
 968     ibd_rc_stat_s::rc_conn_succ
 969     ibd_rc_stat_s::rc_conn_fail
 970     ibd_rc_stat_s::rc_null_conn
 971     ibd_rc_stat_s::rc_no_estab_conn
 972     ibd_rc_stat_s::rc_act_close
 973     ibd_rc_stat_s::rc_pas_close
 974     ibd_rc_stat_s::rc_delay_ace_recycle
 975     ibd_rc_stat_s::rc_act_close_simultaneous
 976     ibd_rc_stat_s::rc_reset_cnt
 977     ibd_rc_stat_s::rc_timeout_act
 978     ibd_rc_stat_s::rc_timeout_pas))
 979 #endif
 980 
 981 int
 982 _init()
 983 {
 984         int status;
 985 
 986         status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
 987             PAGESIZE), 0);
 988         if (status != 0) {
 989                 DPRINT(10, "_init:failed in ddi_soft_state_init()");
 990                 return (status);
 991         }
 992 
 993         mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
 994 
 995         mac_init_ops(&ibd_dev_ops, "ibp");
 996         status = mod_install(&ibd_modlinkage);
 997         if (status != 0) {
 998                 DPRINT(10, "_init:failed in mod_install()");
 999                 ddi_soft_state_fini(&ibd_list);
1000                 mac_fini_ops(&ibd_dev_ops);
1001                 return (status);
1002         }
1003 
1004         mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
1005         mutex_enter(&ibd_gstate.ig_mutex);
1006         ibd_gstate.ig_ibt_hdl = NULL;
1007         ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
1008         ibd_gstate.ig_service_list = NULL;
1009         mutex_exit(&ibd_gstate.ig_mutex);
1010 
1011         if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
1012             DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
1013                 return (EIO);
1014         }
1015 
1016         ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
1017 
1018 #ifdef IBD_LOGGING
1019         ibd_log_init();
1020 #endif
1021         return (0);
1022 }
1023 
1024 int
1025 _info(struct modinfo *modinfop)
1026 {
1027         return (mod_info(&ibd_modlinkage, modinfop));
1028 }
1029 
1030 int
1031 _fini()
1032 {
1033         int status;
1034 
1035         status = mod_remove(&ibd_modlinkage);
1036         if (status != 0)
1037                 return (status);
1038 
1039         ibt_unregister_part_attr_cb();
1040 
1041         mac_fini_ops(&ibd_dev_ops);
1042         mutex_destroy(&ibd_objlist_lock);
1043         ddi_soft_state_fini(&ibd_list);
1044         mutex_destroy(&ibd_gstate.ig_mutex);
1045 #ifdef IBD_LOGGING
1046         ibd_log_fini();
1047 #endif
1048         return (0);
1049 }
1050 
1051 /*
1052  * Convert the GID part of the mac address from network byte order
1053  * to host order.
1054  */
1055 static void
1056 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
1057 {
1058         ib_sn_prefix_t nbopref;
1059         ib_guid_t nboguid;
1060 
1061         bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
1062         bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
1063         dgid->gid_prefix = b2h64(nbopref);
1064         dgid->gid_guid = b2h64(nboguid);
1065 }
1066 
1067 /*
1068  * Create the IPoIB address in network byte order from host order inputs.
1069  */
1070 static void
1071 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
1072     ib_guid_t guid)
1073 {
1074         ib_sn_prefix_t nbopref;
1075         ib_guid_t nboguid;
1076 
1077         mac->ipoib_qpn = htonl(qpn);
1078         nbopref = h2b64(prefix);
1079         nboguid = h2b64(guid);
1080         bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
1081         bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
1082 }
1083 
1084 /*
1085  * Send to the appropriate all-routers group when the IBA multicast group
1086  * does not exist, based on whether the target group is v4 or v6.
1087  */
1088 static boolean_t
1089 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
1090     ipoib_mac_t *rmac)
1091 {
1092         boolean_t retval = B_TRUE;
1093         uint32_t adjscope = state->id_scope << 16;
1094         uint32_t topword;
1095 
1096         /*
1097          * Copy the first 4 bytes in without assuming any alignment of
1098          * input mac address; this will have IPoIB signature, flags and
1099          * scope bits.
1100          */
1101         bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
1102         topword = ntohl(topword);
1103 
1104         /*
1105          * Generate proper address for IPv4/v6, adding in the Pkey properly.
1106          */
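              /*
               * For reference (illustrative): the 64-bit GID prefix passed
               * to ibd_h2n_mac() below is rebuilt from the original
               * signature/scope word plus the pkey, and the GID suffix
               * becomes INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP, i.e.
               * 224.0.0.2 - 224.0.0.0 = 2, the all-routers group (for the
               * IPv4 case, ff12:401b:<pkey>::2 with link-local scope).
               */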
1107         if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
1108             (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
1109                 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
1110                     ((uint32_t)(state->id_pkey << 16))),
1111                     (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
1112         else
1113                 /*
1114                  * Does not have proper bits in the mgid address.
1115                  */
1116                 retval = B_FALSE;
1117 
1118         return (retval);
1119 }
1120 
1121 /*
1122  * Membership states for different mcg's are tracked by two lists:
1123  * the "non" list is used for promiscuous mode, when all mcg traffic
1124  * needs to be inspected. This type of membership is never used for
1125  * transmission, so there can not be an AH in the active list
1126  * corresponding to a member in this list. This list does not need
1127  * any protection, since all operations are performed by the async
1128  * thread.
1129  *
1130  * "Full" and "SendOnly" membership is tracked using a single list,
1131  * the "full" list. This is because this single list can then be
1132  * searched during transmit to a multicast group (if an AH for the
1133  * mcg is not found in the active list), since at least one type
1134  * of membership must be present before initiating the transmit.
1135  * This list is also emptied during driver detach, since sendonly
1136  * membership acquired during transmit is dropped at detach time
1137  * along with ipv4 broadcast full membership. Insert/deletes to
1138  * this list are done only by the async thread, but it is also
1139  * searched in program context (see multicast disable case), thus
1140  * the id_mc_mutex protects the list. The driver detach path also
1141  * deconstructs the "full" list, but it ensures that the async
1142  * thread will not be accessing the list (by blocking out mcg
1143  * trap handling and making sure no more Tx reaping will happen).
1144  *
1145  * Currently, an IBA attach is done in the SendOnly case too,
1146  * although this is not required.
1147  */
1148 #define IBD_MCACHE_INSERT_FULL(state, mce) \
1149         list_insert_head(&state->id_mc_full, mce)
1150 #define IBD_MCACHE_INSERT_NON(state, mce) \
1151         list_insert_head(&state->id_mc_non, mce)
1152 #define IBD_MCACHE_FIND_FULL(state, mgid) \
1153         ibd_mcache_find(mgid, &state->id_mc_full)
1154 #define IBD_MCACHE_FIND_NON(state, mgid) \
1155         ibd_mcache_find(mgid, &state->id_mc_non)
1156 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \
1157         list_remove(&state->id_mc_full, mce)
1158 #define IBD_MCACHE_PULLOUT_NON(state, mce) \
1159         list_remove(&state->id_mc_non, mce)
1160 
1161 static void *
1162 list_get_head(list_t *list)
1163 {
1164         list_node_t *lhead = list_head(list);
1165 
1166         if (lhead != NULL)
1167                 list_remove(list, lhead);
1168         return (lhead);
1169 }
1170 
1171 /*
1172  * This is always guaranteed to be able to queue the work.
1173  */
1174 void
1175 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1176 {
1177         /* Initialize request */
1178         DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1179         ptr->rq_op = op;
1180 
1181         /*
1182          * Queue provided slot onto request pool.
1183          */
1184         mutex_enter(&state->id_acache_req_lock);
1185         list_insert_tail(&state->id_req_list, ptr);
1186 
1187         /* Go, fetch, async thread */
1188         cv_signal(&state->id_acache_req_cv);
1189         mutex_exit(&state->id_acache_req_lock);
1190 }
1191 
1192 /*
1193  * Main body of the per interface async thread.
1194  */
1195 static void
1196 ibd_async_work(ibd_state_t *state)
1197 {
1198         ibd_req_t *ptr;
1199         callb_cpr_t cprinfo;
1200 
1201         mutex_enter(&state->id_acache_req_lock);
1202         CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1203             callb_generic_cpr, "ibd_async_work");
1204 
1205         for (;;) {
1206                 ptr = list_get_head(&state->id_req_list);
1207                 if (ptr != NULL) {
1208                         mutex_exit(&state->id_acache_req_lock);
1209 
1210                         /*
1211                          * If we are in late hca initialization mode, do not
 1212                          * process any async request other than TRAP. TRAP
1213                          * is used for indicating creation of a broadcast group;
1214                          * in which case, we need to join/create the group.
1215                          */
1216                         if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
1217                             (ptr->rq_op != IBD_ASYNC_TRAP)) {
1218                                 goto free_req_and_continue;
1219                         }
1220 
1221                         /*
1222                          * Once we have done the operation, there is no
1223                          * guarantee the request slot is going to be valid,
1224                          * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1225                          * TRAP).
1226                          *
1227                          * Perform the request.
1228                          */
1229                         switch (ptr->rq_op) {
1230                                 case IBD_ASYNC_GETAH:
1231                                         ibd_async_acache(state, &ptr->rq_mac);
1232                                         break;
1233                                 case IBD_ASYNC_JOIN:
1234                                 case IBD_ASYNC_LEAVE:
1235                                         ibd_async_multicast(state,
1236                                             ptr->rq_gid, ptr->rq_op);
1237                                         break;
1238                                 case IBD_ASYNC_PROMON:
1239                                         ibd_async_setprom(state);
1240                                         break;
1241                                 case IBD_ASYNC_PROMOFF:
1242                                         ibd_async_unsetprom(state);
1243                                         break;
1244                                 case IBD_ASYNC_REAP:
1245                                         ibd_async_reap_group(state,
1246                                             ptr->rq_ptr, ptr->rq_gid,
1247                                             IB_MC_JSTATE_FULL);
1248                                         /*
 1249                                          * the req buf is embedded in the mce
1250                                          * structure, so we do not need
1251                                          * to free it here.
1252                                          */
1253                                         ptr = NULL;
1254                                         break;
1255                                 case IBD_ASYNC_TRAP:
1256                                         ibd_async_trap(state, ptr);
1257                                         break;
1258                                 case IBD_ASYNC_SCHED:
1259                                         ibd_async_txsched(state);
1260                                         break;
1261                                 case IBD_ASYNC_LINK:
1262                                         ibd_async_link(state, ptr);
1263                                         break;
1264                                 case IBD_ASYNC_EXIT:
1265                                         mutex_enter(&state->id_acache_req_lock);
1266 #ifndef __lock_lint
1267                                         CALLB_CPR_EXIT(&cprinfo);
1268 #else
1269                                         mutex_exit(&state->id_acache_req_lock);
1270 #endif
1271                                         return;
1272                                 case IBD_ASYNC_RC_TOO_BIG:
1273                                         ibd_async_rc_process_too_big(state,
1274                                             ptr);
1275                                         break;
1276                                 case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
1277                                         ibd_async_rc_close_act_chan(state, ptr);
1278                                         break;
1279                                 case IBD_ASYNC_RC_RECYCLE_ACE:
1280                                         ibd_async_rc_recycle_ace(state, ptr);
1281                                         break;
1282                                 case IBD_ASYNC_RC_CLOSE_PAS_CHAN:
1283                                         (void) ibd_rc_pas_close(ptr->rq_ptr,
1284                                             B_TRUE, B_TRUE);
1285                                         break;
1286                         }
1287 free_req_and_continue:
1288                         if (ptr != NULL)
1289                                 kmem_cache_free(state->id_req_kmc, ptr);
1290 
1291                         mutex_enter(&state->id_acache_req_lock);
1292                 } else {
1293 #ifndef __lock_lint
1294                         /*
 1295                          * Nothing to do: wait till a new request arrives.
1296                          */
1297                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1298                         cv_wait(&state->id_acache_req_cv,
1299                             &state->id_acache_req_lock);
1300                         CALLB_CPR_SAFE_END(&cprinfo,
1301                             &state->id_acache_req_lock);
1302 #endif
1303                 }
1304         }
1305 
1306         /*NOTREACHED*/
1307         _NOTE(NOT_REACHED)
1308 }
1309 
1310 /*
 1311  * Return whether it is safe to queue requests to the async daemon; primarily
 1312  * for subnet trap and async event handling. Disallow requests before the
 1313  * daemon is created, and when interface deinitialization starts.
1314  */
1315 static boolean_t
1316 ibd_async_safe(ibd_state_t *state)
1317 {
1318         mutex_enter(&state->id_trap_lock);
1319         if (state->id_trap_stop) {
1320                 mutex_exit(&state->id_trap_lock);
1321                 return (B_FALSE);
1322         }
1323         state->id_trap_inprog++;
1324         mutex_exit(&state->id_trap_lock);
1325         return (B_TRUE);
1326 }
1327 
1328 /*
1329  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
 1330  * trap or event handling to complete, so that it can kill the async thread and
 1331  * deconstruct the mcg/ace list.
1332  */
1333 static void
1334 ibd_async_done(ibd_state_t *state)
1335 {
1336         mutex_enter(&state->id_trap_lock);
1337         if (--state->id_trap_inprog == 0)
1338                 cv_signal(&state->id_trap_cv);
1339         mutex_exit(&state->id_trap_lock);
1340 }
1341 
1342 /*
1343  * Hash functions:
1344  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1345  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1346  * These operate on mac addresses input into ibd_send, but there is no
1347  * guarantee on the alignment of the ipoib_mac_t structure.
1348  */
1349 /*ARGSUSED*/
1350 static uint_t
1351 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1352 {
1353         ulong_t ptraddr = (ulong_t)key;
1354         uint_t hval;
1355 
1356         /*
1357          * If the input address is 4 byte aligned, we can just dereference
1358          * it. This is most common, since IP will send in a 4 byte aligned
 1359          * IP header, which implies the 24 byte IPoIB pseudo header will be
1360          * 4 byte aligned too.
1361          */
1362         if ((ptraddr & 3) == 0)
1363                 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1364 
1365         bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1366         return (hval);
1367 }
1368 
1369 static int
1370 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1371 {
1372         if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1373                 return (0);
1374         else
1375                 return (1);
1376 }
1377 
1378 /*
1379  * Initialize all the per interface caches and lists; AH cache,
1380  * MCG list etc.
1381  */
1382 static int
1383 ibd_acache_init(ibd_state_t *state)
1384 {
1385         ibd_ace_t *ce;
1386         int i;
1387 
1388         mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1389         mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1390         mutex_enter(&state->id_ac_mutex);
1391         list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1392             offsetof(ibd_ace_t, ac_list));
1393         list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1394             offsetof(ibd_ace_t, ac_list));
1395         state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1396             state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1397             ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1398         list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1399             offsetof(ibd_mce_t, mc_list));
1400         list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1401             offsetof(ibd_mce_t, mc_list));
1402         state->id_ac_hot_ace = NULL;
1403 
1404         state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1405             state->id_num_ah, KM_SLEEP);
1406         for (i = 0; i < state->id_num_ah; i++, ce++) {
1407                 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1408                     state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1409                         mutex_exit(&state->id_ac_mutex);
1410                         ibd_acache_fini(state);
1411                         return (DDI_FAILURE);
1412                 } else {
1413                         CLEAR_REFCYCLE(ce);
1414                         ce->ac_mce = NULL;
1415                         mutex_init(&ce->tx_too_big_mutex, NULL,
1416                             MUTEX_DRIVER, NULL);
1417                         IBD_ACACHE_INSERT_FREE(state, ce);
1418                 }
1419         }
1420         mutex_exit(&state->id_ac_mutex);
1421         return (DDI_SUCCESS);
1422 }
1423 
1424 static void
1425 ibd_acache_fini(ibd_state_t *state)
1426 {
1427         ibd_ace_t *ptr;
1428 
1429         mutex_enter(&state->id_ac_mutex);
1430 
1431         while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1432                 ASSERT(GET_REF(ptr) == 0);
1433                 mutex_destroy(&ptr->tx_too_big_mutex);
1434                 (void) ibt_free_ud_dest(ptr->ac_dest);
1435         }
1436 
1437         while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1438                 ASSERT(GET_REF(ptr) == 0);
1439                 mutex_destroy(&ptr->tx_too_big_mutex);
1440                 (void) ibt_free_ud_dest(ptr->ac_dest);
1441         }
1442 
1443         list_destroy(&state->id_ah_free);
1444         list_destroy(&state->id_ah_active);
1445         list_destroy(&state->id_mc_full);
1446         list_destroy(&state->id_mc_non);
1447         kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1448         mutex_exit(&state->id_ac_mutex);
1449         mutex_destroy(&state->id_ac_mutex);
1450         mutex_destroy(&state->id_mc_mutex);
1451 }
1452 
1453 /*
1454  * Search AH active hash list for a cached path to input destination.
1455  * If we are "just looking", hold == F. When we are in the Tx path,
1456  * we set hold == T to grab a reference on the AH so that it can not
1457  * be recycled to a new destination while the Tx request is posted.
1458  */
1459 ibd_ace_t *
1460 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1461 {
1462         ibd_ace_t *ptr;
1463 
1464         ASSERT(mutex_owned(&state->id_ac_mutex));
1465 
1466         /*
1467          * Do hash search.
1468          */
1469         if (mod_hash_find(state->id_ah_active_hash,
1470             (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1471                 if (hold)
1472                         INC_REF(ptr, num);
1473                 return (ptr);
1474         }
1475         return (NULL);
1476 }
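
/*
 * Illustrative sketch (not an actual caller) of how the Tx path uses
 * ibd_acache_find(); "ace", "dest" and "numwqe" are just example locals.
 * The real callers are ibd_acache_lookup() below and the multicast and
 * link handling paths.
 *
 *	mutex_enter(&state->id_ac_mutex);
 *	ace = ibd_acache_find(state, &dest, B_TRUE, numwqe);
 *	mutex_exit(&state->id_ac_mutex);
 *	if (ace != NULL) {
 *		post the send using ace->ac_dest; the reference taken
 *		above is released by the Tx completion cleanup
 *	}
 */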
1477 
1478 /*
1479  * This is called by the tx side; if an initialized AH is found in
1480  * the active list, it is locked down and can be used; if no entry
1481  * is found, an async request is queued to do path resolution.
1482  */
1483 static ibd_ace_t *
1484 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1485 {
1486         ibd_ace_t *ptr;
1487         ibd_req_t *req;
1488 
1489         /*
1490          * Only attempt to print when we can; in the mdt pattr case, the
1491          * address is not aligned properly.
1492          */
1493         if (((ulong_t)mac & 3) == 0) {
1494                 DPRINT(4,
1495                     "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1496                     htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1497                     htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1498                     htonl(mac->ipoib_gidsuff[1]));
1499         }
1500 
1501         mutex_enter(&state->id_ac_mutex);
1502 
1503         if (((ptr = state->id_ac_hot_ace) != NULL) &&
1504             (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1505                 INC_REF(ptr, numwqe);
1506                 mutex_exit(&state->id_ac_mutex);
1507                 return (ptr);
1508         }
1509         if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1510                 state->id_ac_hot_ace = ptr;
1511                 mutex_exit(&state->id_ac_mutex);
1512                 return (ptr);
1513         }
1514 
1515         /*
1516          * Implementation of a single outstanding async request: if
1517          * the operation has not started yet, queue a request and move
1518          * to the ongoing state. Remember in id_ah_addr which address
1519          * we are queueing the request for, in case we need to flag an
1520          * error; any further requests, for the same or a different
1521          * address, are sent back to GLDv3 to be retried until the
1522          * operation completes. The async thread will update id_ah_op
1523          * with an error indication, or will set it to indicate that the
1524          * next lookup can start; either way, it will mac_tx_update() so
1525          * that all blocked requests come back here.
1526          */
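        /*
         * Summary of the id_ah_op transitions driven here and by
         * ibd_async_acache() (descriptive only):
         *   IBD_OP_NOTSTARTED -> IBD_OP_ONGOING (request queued below)
         *   IBD_OP_ONGOING -> IBD_OP_NOTSTARTED, IBD_OP_ERRORED or
         *       IBD_OP_ROUTERED (set by the async thread)
         *   IBD_OP_ERRORED/IBD_OP_ROUTERED -> IBD_OP_NOTSTARTED (below)
         */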
1527         *err = EAGAIN;
1528         if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1529                 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1530                 if (req != NULL) {
1531                         /*
1532                          * We did not even find the entry; queue a request
1533                          * for it.
1534                          */
1535                         bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1536                         state->id_ah_op = IBD_OP_ONGOING;
1537                         ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1538                         bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1539                 }
1540         } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1541             (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1542                 /*
1543                  * Check the status of the pathrecord lookup request
1544                  * we had queued before.
1545                  */
1546                 if (state->id_ah_op == IBD_OP_ERRORED) {
1547                         *err = EFAULT;
1548                         state->id_ah_error++;
1549                 } else {
1550                         /*
1551                          * IBD_OP_ROUTERED case: We need to send to the
1552                          * all-router MCG. If we can find the AH for
1553                          * the mcg, the Tx will be attempted. If we
1554                          * do not find the AH, we return NORESOURCES
1555                          * to retry.
1556                          */
1557                         ipoib_mac_t routermac;
1558 
1559                         (void) ibd_get_allroutergroup(state, mac, &routermac);
1560                         ptr = ibd_acache_find(state, &routermac, B_TRUE,
1561                             numwqe);
1562                 }
1563                 state->id_ah_op = IBD_OP_NOTSTARTED;
1564         } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1565             (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1566                 /*
1567                  * This case can happen when we get a higher band
1568                  * (higher priority) packet. The easiest way to handle it
1569                  * is to reset the state machine for the new address.
1570                  */
1571                 state->id_ah_op = IBD_OP_NOTSTARTED;
1572         }
1573         mutex_exit(&state->id_ac_mutex);
1574 
1575         return (ptr);
1576 }
1577 
1578 /*
1579  * Grab a not-currently-in-use AH/PathRecord from the active
1580  * list to recycle to a new destination. Only the async thread
1581  * executes this code.
1582  */
1583 static ibd_ace_t *
1584 ibd_acache_get_unref(ibd_state_t *state)
1585 {
1586         ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1587         boolean_t try_rc_chan_recycle = B_FALSE;
1588 
1589         ASSERT(mutex_owned(&state->id_ac_mutex));
1590 
1591         /*
1592          * Do plain linear search.
1593          */
1594         while (ptr != NULL) {
1595                 /*
1596                  * Note that it is possible that the "cycle" bit
1597                  * is set on the AH w/o any reference count. The
1598                  * mcg must have been deleted, and the tx cleanup
1599                  * just decremented the reference count to 0, but
1600                  * hasn't gotten around to grabbing the id_ac_mutex
1601                  * to move the AH into the free list.
1602                  */
1603                 if (GET_REF(ptr) == 0) {
1604                         if (ptr->ac_chan != NULL) {
1605                                 ASSERT(state->id_enable_rc == B_TRUE);
1606                                 if (!try_rc_chan_recycle) {
1607                                         try_rc_chan_recycle = B_TRUE;
1608                                         ibd_rc_signal_ace_recycle(state, ptr);
1609                                 }
1610                         } else {
1611                                 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1612                                 break;
1613                         }
1614                 }
1615                 ptr = list_prev(&state->id_ah_active, ptr);
1616         }
1617         return (ptr);
1618 }
1619 
1620 /*
1621  * Invoked to clean up an AH from the active list on a multicast
1622  * disable, to handle sendonly memberships during mcg traps, and
1623  * for port up processing of multicast and unicast AHs.
1624  * Normally, the AH is taken off the active list, and put into
1625  * the free list to be recycled for a new destination. In case
1626  * Tx requests on the AH have not completed yet, the AH is marked
1627  * for reaping (which will put the AH on the free list) once the Tx's
1628  * complete; in this case, depending on the "force" input, we take
1629  * out the AH from the active list right now, or leave it also for
1630  * the reap operation. Returns TRUE if the AH is taken off the active
1631  * list (and either put into the free list right now, or arranged for
1632  * later), FALSE otherwise.
1633  */
1634 boolean_t
1635 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1636 {
1637         ibd_ace_t *acactive;
1638         boolean_t ret = B_TRUE;
1639 
1640         ASSERT(mutex_owned(&state->id_ac_mutex));
1641 
1642         if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1643 
1644                 /*
1645                  * Note that the AH might already have the cycle bit set
1646                  * on it; this might happen if sequences of multicast
1647                  * enables and disables are coming so fast, that posted
1648                  * Tx's to the mcg have not completed yet, and the cycle
1649                  * bit is set successively by each multicast disable.
1650                  */
1651                 if (SET_CYCLE_IF_REF(acactive)) {
1652                         if (!force) {
1653                                 /*
1654                                  * The ace is kept on the active list, further
1655                                  * Tx's can still grab a reference on it; the
1656                                  * ace is reaped when all pending Tx's
1657                                  * referencing the AH complete.
1658                                  */
1659                                 ret = B_FALSE;
1660                         } else {
1661                                 /*
1662                                  * In the mcg trap case, we always pull the
1663                                  * AH from the active list. And also the port
1664                                  * up multi/unicast case.
1665                                  */
1666                                 ASSERT(acactive->ac_chan == NULL);
1667                                 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1668                                 acactive->ac_mce = NULL;
1669                         }
1670                 } else {
1671                         /*
1672                          * The ref count is 0, so reclaim the ace
1673                          * immediately after pulling it out of the
1674                          * active list.
1675                          */
1676                         ASSERT(acactive->ac_chan == NULL);
1677                         IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1678                         acactive->ac_mce = NULL;
1679                         IBD_ACACHE_INSERT_FREE(state, acactive);
1680                 }
1681 
1682         }
1683         return (ret);
1684 }
1685 
1686 /*
1687  * Helper function for async path record lookup. If we are trying to
1688  * Tx to a MCG, check our membership, possibly trying to join the
1689  * group if required. If that fails, try to send the packet to the
1690  * all router group (indicated by the redirect output), pointing
1691  * the input mac address to the router mcg address.
1692  */
1693 static ibd_mce_t *
1694 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1695 {
1696         ib_gid_t mgid;
1697         ibd_mce_t *mce;
1698         ipoib_mac_t routermac;
1699 
1700         *redirect = B_FALSE;
1701         ibd_n2h_gid(mac, &mgid);
1702 
1703         /*
1704          * Check the FullMember+SendOnlyNonMember list.
1705          * Since we are the only thread that manipulates the
1706          * id_mc_full list, no locks are needed.
1707          */
1708         mce = IBD_MCACHE_FIND_FULL(state, mgid);
1709         if (mce != NULL) {
1710                 DPRINT(4, "ibd_async_mcache : already joined to group");
1711                 return (mce);
1712         }
1713 
1714         /*
1715          * Not found; try to join(SendOnlyNonMember) and attach.
1716          */
1717         DPRINT(4, "ibd_async_mcache : not joined to group");
1718         if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1719             NULL) {
1720                 DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1721                 return (mce);
1722         }
1723 
1724         /*
1725          * MCGroup not present; try to join the all-router group. If
1726          * any of the following steps succeed, we will be redirecting
1727          * to the all router group.
1728          */
1729         DPRINT(4, "ibd_async_mcache : nonmem join failed");
1730         if (!ibd_get_allroutergroup(state, mac, &routermac))
1731                 return (NULL);
1732         *redirect = B_TRUE;
1733         ibd_n2h_gid(&routermac, &mgid);
1734         bcopy(&routermac, mac, IPOIB_ADDRL);
1735         DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1736             mgid.gid_prefix, mgid.gid_guid);
1737 
1738         /*
1739          * Are we already joined to the router group?
1740          */
1741         if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1742                 DPRINT(4, "ibd_async_mcache : using already joined router"
1743                     "group\n");
1744                 return (mce);
1745         }
1746 
1747         /*
1748          * Can we join(SendOnlyNonMember) the router group?
1749          */
1750         DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1751         if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1752             NULL) {
1753                 DPRINT(4, "ibd_async_mcache : joined to router grp");
1754                 return (mce);
1755         }
1756 
1757         return (NULL);
1758 }
1759 
1760 /*
1761  * Async path record lookup code.
1762  */
1763 static void
1764 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1765 {
1766         ibd_ace_t *ce;
1767         ibd_mce_t *mce = NULL;
1768         ibt_path_attr_t path_attr;
1769         ibt_path_info_t path_info;
1770         ib_gid_t destgid;
1771         char ret = IBD_OP_NOTSTARTED;
1772 
1773         DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1774             htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1775             htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1776             htonl(mac->ipoib_gidsuff[1]));
1777 
1778         /*
1779          * Check whether we are trying to transmit to a MCG.
1780          * In that case, we need to make sure we are a member of
1781          * the MCG.
1782          */
1783         if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1784                 boolean_t redirected;
1785 
1786                 /*
1787                  * If we cannot find or join the group or even
1788                  * redirect, error out.
1789                  */
1790                 if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1791                     NULL) {
1792                         state->id_ah_op = IBD_OP_ERRORED;
1793                         return;
1794                 }
1795 
1796                 /*
1797                  * If we got redirected, we need to determine whether
1798                  * the AH for the new mcg is already in the cache, in
1799                  * which case we need not pull it in; otherwise we
1800                  * proceed to get the path for the new mcg. There is no
1801                  * guarantee that an AH currently in the cache will still
1802                  * be there when we look in ibd_acache_lookup(), but
1803                  * that's okay, we will come back here.
1804                  */
1805                 if (redirected) {
1806                         ret = IBD_OP_ROUTERED;
1807                         DPRINT(4, "ibd_async_acache :  redirected to "
1808                             "%08X:%08X:%08X:%08X:%08X",
1809                             htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1810                             htonl(mac->ipoib_gidpref[1]),
1811                             htonl(mac->ipoib_gidsuff[0]),
1812                             htonl(mac->ipoib_gidsuff[1]));
1813 
1814                         mutex_enter(&state->id_ac_mutex);
1815                         if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1816                                 state->id_ah_op = IBD_OP_ROUTERED;
1817                                 mutex_exit(&state->id_ac_mutex);
1818                                 DPRINT(4, "ibd_async_acache : router AH found");
1819                                 return;
1820                         }
1821                         mutex_exit(&state->id_ac_mutex);
1822                 }
1823         }
1824 
1825         /*
1826          * Get an AH from the free list.
1827          */
1828         mutex_enter(&state->id_ac_mutex);
1829         if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1830                 /*
1831                  * No free ones; try to grab an unreferenced active
1832                  * one. Maybe we need to make the active list LRU,
1833                  * but that will create more work for Tx callbacks.
1834                  * Is there a way of not having to pull out the
1835                  * entry from the active list, but just indicate it
1836                  * is being recycled? Yes, but that creates one more
1837                  * check in the fast lookup path.
1838                  */
1839                 if ((ce = ibd_acache_get_unref(state)) == NULL) {
1840                         /*
1841                          * Pretty serious shortage now.
1842                          */
1843                         state->id_ah_op = IBD_OP_NOTSTARTED;
1844                         mutex_exit(&state->id_ac_mutex);
1845                         DPRINT(10, "ibd_async_acache : failed to find AH "
1846                             "slot\n");
1847                         return;
1848                 }
1849                 /*
1850                  * We could check whether ac_mce points to a SendOnly
1851                  * member and drop that membership now. Or do it lazily
1852                  * at detach time.
1853                  */
1854                 ce->ac_mce = NULL;
1855         }
1856         mutex_exit(&state->id_ac_mutex);
1857         ASSERT(ce->ac_mce == NULL);
1858 
1859         /*
1860          * Update the entry.
1861          */
1862         bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1863 
1864         bzero(&path_info, sizeof (path_info));
1865         bzero(&path_attr, sizeof (ibt_path_attr_t));
1866         path_attr.pa_sgid = state->id_sgid;
1867         path_attr.pa_num_dgids = 1;
1868         ibd_n2h_gid(&ce->ac_mac, &destgid);
1869         path_attr.pa_dgids = &destgid;
1870         path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1871         path_attr.pa_pkey = state->id_pkey;
1872         if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1873             &path_info, NULL) != IBT_SUCCESS) {
1874                 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1875                 goto error;
1876         }
1877         if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1878             ntohl(ce->ac_mac.ipoib_qpn),
1879             &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1880                 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1881                 goto error;
1882         }
1883 
1884         /*
1885          * mce is set whenever an AH is being associated with a
1886          * MCG; this will come in handy when we leave the MCG. The
1887          * lock protects Tx fastpath from scanning the active list.
1888          */
1889         if (mce != NULL)
1890                 ce->ac_mce = mce;
1891 
1892         /*
1893          * Initiate an RC mode connection for a unicast address.
1894          */
1895         if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1896             (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1897                 ASSERT(ce->ac_chan == NULL);
1898                 DPRINT(10, "ibd_async_acache: call "
1899                     "ibd_rc_try_connect(ace=%p)", ce);
1900                 ibd_rc_try_connect(state, ce, &path_info);
1901                 if (ce->ac_chan == NULL) {
1902                         DPRINT(10, "ibd_async_acache: fail to setup RC"
1903                             " channel");
1904                         state->rc_conn_fail++;
1905                         goto error;
1906                 }
1907         }
1908 
1909         mutex_enter(&state->id_ac_mutex);
1910         IBD_ACACHE_INSERT_ACTIVE(state, ce);
1911         state->id_ah_op = ret;
1912         mutex_exit(&state->id_ac_mutex);
1913         return;
1914 error:
1915         /*
1916          * We might want to drop SendOnly membership here if we
1917          * joined above. The lock protects Tx callbacks inserting
1918          * into the free list.
1919          */
1920         mutex_enter(&state->id_ac_mutex);
1921         state->id_ah_op = IBD_OP_ERRORED;
1922         IBD_ACACHE_INSERT_FREE(state, ce);
1923         mutex_exit(&state->id_ac_mutex);
1924 }
1925 
1926 /*
1927  * While restoring the port's presence on the subnet on a port up, it is
1928  * possible that the port goes down again.
1929  */
1930 static void
1931 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1932 {
1933         ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1934         link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1935             LINK_STATE_UP;
1936         ibd_mce_t *mce, *pmce;
1937         ibd_ace_t *ace, *pace;
1938 
1939         DPRINT(10, "ibd_async_link(): %d", opcode);
1940 
1941         /*
1942          * On a link up, revalidate the link speed/width. No point doing
1943          * this on a link down, since we will be unable to do SA operations,
1944          * defaulting to the lowest speed. Also notice that we update our
1945          * notion of speed before calling mac_link_update(), which will do
1946          * the necessary higher level notifications for speed changes.
1947          */
1948         if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1949                 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1950                 state->id_link_speed = ibd_get_portspeed(state);
1951                 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1952         }
1953 
1954         /*
1955          * Do all the work required to establish our presence on
1956          * the subnet.
1957          */
1958         if (opcode == IBD_LINK_UP_ABSENT) {
1959                 /*
1960                  * If in promiscuous mode ...
1961                  */
1962                 if (state->id_prom_op == IBD_OP_COMPLETED) {
1963                         /*
1964                          * Drop all nonmembership.
1965                          */
1966                         ibd_async_unsetprom(state);
1967 
1968                         /*
1969                          * Then, try to regain nonmembership to all mcg's.
1970                          */
1971                         ibd_async_setprom(state);
1972 
1973                 }
1974 
1975                 /*
1976                  * Drop all sendonly membership (which also gets rid of the
1977                  * AHs); try to reacquire all full membership.
1978                  */
1979                 mce = list_head(&state->id_mc_full);
1980                 while ((pmce = mce) != NULL) {
1981                         mce = list_next(&state->id_mc_full, mce);
1982                         if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1983                                 ibd_leave_group(state,
1984                                     pmce->mc_info.mc_adds_vect.av_dgid,
1985                                     IB_MC_JSTATE_SEND_ONLY_NON);
1986                         else
1987                                 ibd_reacquire_group(state, pmce);
1988                 }
1989 
1990                 /*
1991                  * Recycle all active AHs to free list (and if there are
1992                  * pending posts, make sure they will go into the free list
1993                  * once the Tx's complete). Grab the lock to prevent
1994                  * concurrent Tx's as well as Tx cleanups.
1995                  */
1996                 mutex_enter(&state->id_ac_mutex);
1997                 ace = list_head(&state->id_ah_active);
1998                 while ((pace = ace) != NULL) {
1999                         boolean_t cycled = B_TRUE;
2000 
2001                         ace = list_next(&state->id_ah_active, ace);
2002                         mce = pace->ac_mce;
2003                         if (pace->ac_chan != NULL) {
2004                                 ASSERT(mce == NULL);
2005                                 ASSERT(state->id_enable_rc == B_TRUE);
2006                                 if (pace->ac_chan->chan_state ==
2007                                     IBD_RC_STATE_ACT_ESTAB) {
2008                                         INC_REF(pace, 1);
2009                                         IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
2010                                         pace->ac_chan->chan_state =
2011                                             IBD_RC_STATE_ACT_CLOSING;
2012                                         ibd_rc_signal_act_close(state, pace);
2013                                 } else {
2014                                         state->rc_act_close_simultaneous++;
2015                                         DPRINT(40, "ibd_async_link: other "
2016                                             "thread is closing it, ace=%p, "
2017                                             "ac_chan=%p, chan_state=%d",
2018                                             pace, pace->ac_chan,
2019                                             pace->ac_chan->chan_state);
2020                                 }
2021                         } else {
2022                                 cycled = ibd_acache_recycle(state,
2023                                     &pace->ac_mac, B_TRUE);
2024                         }
2025                         /*
2026                          * If this is for an mcg, it must be for a fullmember,
2027                          * since we got rid of send-only members above when
2028                          * processing the mce list.
2029                          */
2030                         ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2031                             IB_MC_JSTATE_FULL)));
2032 
2033                         /*
2034                          * Check if the fullmember mce needs to be torn down,
2035                          * i.e. whether the DLPI disable has already been done.
2036                          * If so, do some of the work of tx_cleanup, namely
2037                          * causing leave (which will fail), detach and
2038                          * mce-freeing. tx_cleanup will put the AH into free
2039                          * list. The reason to duplicate some of this
2040                          * tx_cleanup work is because we want to delete the
2041                          * AH right now instead of waiting for tx_cleanup, to
2042                          * force subsequent Tx's to reacquire an AH.
2043                          */
2044                         if ((mce != NULL) && (mce->mc_fullreap))
2045                                 ibd_async_reap_group(state, mce,
2046                                     mce->mc_info.mc_adds_vect.av_dgid,
2047                                     mce->mc_jstate);
2048                 }
2049                 mutex_exit(&state->id_ac_mutex);
2050         }
2051 
2052         /*
2053          * The mac handle is guaranteed to exist since the driver does
2054          * ibt_close_hca() (which stops further events from being
2055          * delivered) before mac_unregister(). At this point, it is
2056          * guaranteed that mac_register() has already been done.
2057          */
2058         mutex_enter(&state->id_link_mutex);
2059         state->id_link_state = lstate;
2060         mac_link_update(state->id_mh, lstate);
2061         mutex_exit(&state->id_link_mutex);
2062 
2063         ibd_async_done(state);
2064 }
2065 
2066 /*
2067  * Check the pkey table to see if we can find the pkey we're looking for.
2068  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
2069  * failure.
2070  */
2071 static int
2072 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
2073     uint16_t *pkix)
2074 {
2075         uint16_t ndx;
2076 
2077         ASSERT(pkix != NULL);
2078 
2079         for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2080                 if (pkey_tbl[ndx] == pkey) {
2081                         *pkix = ndx;
2082                         return (0);
2083                 }
2084         }
2085         return (-1);
2086 }
2087 
2088 /*
2089  * Late HCA Initialization:
2090  * If the plumb had succeeded without an active port or the pkey being
2091  * available, and their availability is now being indicated via PORT_UP
2092  * or PORT_CHANGE respectively, try to start the interface.
2093  *
2094  * Normal Operation:
2095  * When the link is notified up, we need to do a few things, based
2096  * on the port's current p_init_type_reply claiming a reinit has been
2097  * done or not. The reinit steps are:
2098  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2099  *    the old Pkey and GID0 are correct.
2100  * 2. Register for mcg traps (already done by ibmf).
2101  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2102  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2103  * 4. Give up all sendonly memberships.
2104  * 5. Acquire all full memberships.
2105  * 6. In promiscuous mode, acquire all non memberships.
2106  * 7. Recycle all AHs to free list.
2107  */
2108 static void
2109 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2110 {
2111         ibt_hca_portinfo_t *port_infop = NULL;
2112         ibt_status_t ibt_status;
2113         uint_t psize, port_infosz;
2114         ibd_link_op_t opcode;
2115         ibd_req_t *req;
2116         link_state_t new_link_state = LINK_STATE_UP;
2117         uint8_t itreply;
2118         uint16_t pkix;
2119         int ret;
2120 
2121         /*
2122          * Let's not race with a plumb or an unplumb; if we detect a
2123          * pkey relocation event later on here, we may have to restart.
2124          */
2125         ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2126 
2127         mutex_enter(&state->id_link_mutex);
2128 
2129         /*
2130          * If the link state is unknown, a plumb has not yet been attempted
2131          * on the interface. Nothing to do.
2132          */
2133         if (state->id_link_state == LINK_STATE_UNKNOWN) {
2134                 mutex_exit(&state->id_link_mutex);
2135                 goto link_mod_return;
2136         }
2137 
2138         /*
2139          * If the link state is down, we are not in late HCA init, and
2140          * the interface was never successfully started, nothing to do.
2141          */
2142         if ((state->id_link_state == LINK_STATE_DOWN) &&
2143             ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
2144             ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
2145                 mutex_exit(&state->id_link_mutex);
2146                 goto link_mod_return;
2147         }
2148 
2149         /*
2150          * If this routine was called in response to a port down event,
2151          * we just need to see whether the new state should be reported.
2152          */
2153         if (code == IBT_ERROR_PORT_DOWN) {
2154                 new_link_state = LINK_STATE_DOWN;
2155                 goto update_link_state;
2156         }
2157 
2158         /*
2159          * If it's not a port down event we've received, try to get the port
2160          * attributes first. If we fail here, the port is as good as down.
2161          * Otherwise, if the link went down by the time the handler gets
2162          * here, give up - we cannot even validate the pkey/gid since those
2163          * are not valid and this is as bad as a port down anyway.
2164          */
2165         ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2166             &port_infop, &psize, &port_infosz);
2167         if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2168             (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2169                 new_link_state = LINK_STATE_DOWN;
2170                 goto update_link_state;
2171         }
2172 
2173         /*
2174          * If in the previous attempt the pkey was not found, either due
2175          * to the port state being down or its absence in the pkey table,
2176          * look for it now and try to start the interface.
2177          */
2178         if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
2179                 mutex_exit(&state->id_link_mutex);
2180                 if ((ret = ibd_start(state)) != 0) {
2181                         DPRINT(10, "ibd_linkmod: cannot start from late HCA "
2182                             "init, ret=%d", ret);
2183                 }
2184                 ibt_free_portinfo(port_infop, port_infosz);
2185                 goto link_mod_return;
2186         }
2187 
2188         /*
2189          * Check the SM InitTypeReply flags. If both NoLoadReply and
2190          * PreserveContentReply are 0, we don't know anything about the
2191          * data loaded into the port attributes, so we need to verify
2192          * if gid0 and pkey are still valid.
2193          */
2194         itreply = port_infop->p_init_type_reply;
2195         if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2196             ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2197                 /*
2198                  * Check to see if the subnet part of GID0 has changed. If
2199                  * not, check the simple case first to see if the pkey
2200                  * index is the same as before; finally check to see if the
2201                  * pkey has been relocated to a different index in the table.
2202                  */
2203                 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2204                 if (bcmp(port_infop->p_sgid_tbl,
2205                     &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2206 
2207                         new_link_state = LINK_STATE_DOWN;
2208 
2209                 } else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2210                     state->id_pkey) {
2211 
2212                         new_link_state = LINK_STATE_UP;
2213 
2214                 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2215                     port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2216 
2217                         ibt_free_portinfo(port_infop, port_infosz);
2218                         mutex_exit(&state->id_link_mutex);
2219 
2220                         /*
2221                          * Currently a restart is required if our pkey has moved
2222                          * in the pkey table. If we get the ibt_recycle_ud() to
2223                          * work as documented (expected), we may be able to
2224                          * avoid a complete restart.  Note that we've already
2225                          * marked both the start and stop 'in-progress' flags,
2226                          * so it is ok to go ahead and do this restart.
2227                          */
2228                         (void) ibd_undo_start(state, LINK_STATE_DOWN);
2229                         if ((ret = ibd_start(state)) != 0) {
2230                                 DPRINT(10, "ibd_restart: cannot restart, "
2231                                     "ret=%d", ret);
2232                         }
2233 
2234                         goto link_mod_return;
2235                 } else {
2236                         new_link_state = LINK_STATE_DOWN;
2237                 }
2238                 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2239         }
2240 
2241 update_link_state:
2242         if (port_infop) {
2243                 ibt_free_portinfo(port_infop, port_infosz);
2244         }
2245 
2246         /*
2247          * If we're reporting a link up, check InitTypeReply to see if
2248          * the SM has ensured that the port's presence in mcg, traps,
2249          * etc. is intact.
2250          */
2251         if (new_link_state == LINK_STATE_DOWN) {
2252                 opcode = IBD_LINK_DOWN;
2253         } else {
2254                 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2255                     SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2256                         opcode = IBD_LINK_UP;
2257                 } else {
2258                         opcode = IBD_LINK_UP_ABSENT;
2259                 }
2260         }
2261 
2262         /*
2263          * If the old state is the same as the new state, and the SM indicated
2264          * no change in the port parameters, nothing to do.
2265          */
2266         if ((state->id_link_state == new_link_state) && (opcode !=
2267             IBD_LINK_UP_ABSENT)) {
2268                 mutex_exit(&state->id_link_mutex);
2269                 goto link_mod_return;
2270         }
2271 
2272         /*
2273          * Ok, so there was a link state change; see if it's safe to ask
2274          * the async thread to do the work.
2275          */
2276         if (!ibd_async_safe(state)) {
2277                 state->id_link_state = new_link_state;
2278                 mutex_exit(&state->id_link_mutex);
2279                 goto link_mod_return;
2280         }
2281 
2282         mutex_exit(&state->id_link_mutex);
2283 
2284         /*
2285          * Queue up a request for ibd_async_link() to handle this link
2286          * state change event.
2287          */
2288         req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2289         req->rq_ptr = (void *)opcode;
2290         ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2291 
2292 link_mod_return:
2293         ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2294 }
2295 
2296 /*
2297  * For the port up/down events, IBTL guarantees there will not be concurrent
2298  * invocations of the handler. IBTL might coalesce link transition events,
2299  * and not invoke the handler for _each_ up/down transition, but it will
2300  * invoke the handler with the last known state.
2301  */
2302 static void
2303 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2304     ibt_async_code_t code, ibt_async_event_t *event)
2305 {
2306         ibd_state_t *state = (ibd_state_t *)clnt_private;
2307 
2308         switch (code) {
2309         case IBT_ERROR_CATASTROPHIC_CHAN:
2310                 ibd_print_warn(state, "catastrophic channel error");
2311                 break;
2312         case IBT_ERROR_CQ:
2313                 ibd_print_warn(state, "completion queue error");
2314                 break;
2315         case IBT_PORT_CHANGE_EVENT:
2316                 /*
2317                  * Events will be delivered to all instances that have
2318                  * done ibt_open_hca() but not yet done ibt_close_hca().
2319                  * Only need to do work for our port; IBTF will deliver
2320                  * events for other ports on the hca we have ibt_open_hca'ed
2321                  * too. Note that id_port is initialized in ibd_attach()
2322                  * before we do an ibt_open_hca() in ibd_attach().
2323                  * before we do an ibt_open_hca() there.
2324                 ASSERT(state->id_hca_hdl == hca_hdl);
2325                 if (state->id_port != event->ev_port)
2326                         break;
2327 
2328                 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2329                     IBT_PORT_CHANGE_PKEY) {
2330                         ibd_link_mod(state, code);
2331                 }
2332                 break;
2333         case IBT_ERROR_PORT_DOWN:
2334         case IBT_CLNT_REREG_EVENT:
2335         case IBT_EVENT_PORT_UP:
2336                 /*
2337                  * Events will be delivered to all instances that have
2338                  * done ibt_open_hca() but not yet done ibt_close_hca().
2339                  * Only need to do work for our port; IBTF will deliver
2340                  * events for other ports on the hca we have ibt_open_hca'ed
2341                  * too. Note that id_port is initialized in ibd_attach()
2342                  * before we do an ibt_open_hca() there.
2343                  */
2344                 ASSERT(state->id_hca_hdl == hca_hdl);
2345                 if (state->id_port != event->ev_port)
2346                         break;
2347 
2348                 ibd_link_mod(state, code);
2349                 break;
2350 
2351         case IBT_HCA_ATTACH_EVENT:
2352         case IBT_HCA_DETACH_EVENT:
2353                 /*
2354                  * When a new card is plugged into the system, attach_event is
2355                  * invoked. Additionally, a cfgadm needs to be run to make the
2356                  * card known to the system, and an ifconfig needs to be run to
2357                  * plumb up any ibd interfaces on the card. In the case of card
2358                  * unplug, a cfgadm is run that will trigger any RCM scripts to
2359                  * unplumb the ibd interfaces on the card; when the card is
2360                  * actually unplugged, the detach_event is invoked;
2361                  * additionally, if any ibd instances are still active on the
2362                  * card (e.g. there were no associated RCM scripts), the
2363                  * driver's detach routine is invoked.
2364                  */
2365                 break;
2366         default:
2367                 break;
2368         }
2369 }
2370 
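/*
 * Allocate a mac_register_t, fill in the GLDv3 registration parameters
 * for this instance (callbacks, SDU limits, private properties) and
 * register with the mac layer.
 */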
2371 static int
2372 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2373 {
2374         mac_register_t *macp;
2375         int ret;
2376 
2377         if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2378                 DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2379                 return (DDI_FAILURE);
2380         }
2381 
2382         /*
2383          * Note that when we register with mac during attach, we don't
2384          * have the id_macaddr yet, so we'll simply be registering a
2385          * zero macaddr that we'll overwrite later during plumb (in
2386          * ibd_m_start()). The same is true of id_mtu: we'll update
2387          * the mac layer with the correct mtu during plumb.
2388          */
2389         macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2390         macp->m_driver = state;
2391         macp->m_dip = dip;
2392         macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2393         macp->m_callbacks = &ibd_m_callbacks;
2394         macp->m_min_sdu = 0;
2395         macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
2396         if (state->id_type == IBD_PORT_DRIVER) {
2397                 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2398         } else if (state->id_enable_rc) {
2399                 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2400         } else {
2401                 macp->m_max_sdu = IBD_DEF_MAX_SDU;
2402         }
2403         macp->m_priv_props = ibd_priv_props;
2404 
2405         /*
2406          *  Register ourselves with the GLDv3 interface
2407          */
2408         if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2409                 mac_free(macp);
2410                 DPRINT(10,
2411                     "ibd_register_mac: mac_register() failed, ret=%d", ret);
2412                 return (DDI_FAILURE);
2413         }
2414 
2415         mac_free(macp);
2416         return (DDI_SUCCESS);
2417 }
2418 
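/*
 * Query the HCA attributes and record the capabilities the rest of the
 * driver keys off of: checksum offload, LSO, reserved lkey, maximum SGL
 * sizes and maximum channel size.
 */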
2419 static int
2420 ibd_record_capab(ibd_state_t *state)
2421 {
2422         ibt_hca_attr_t hca_attrs;
2423         ibt_status_t ibt_status;
2424 
2425         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2426 
2427         /*
2428          * Query the HCA and fetch its attributes
2429          */
2430         ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2431         ASSERT(ibt_status == IBT_SUCCESS);
2432 
2433         /*
2434          * 1. Set the Hardware Checksum capability. Currently we only consider
2435          *    full checksum offload.
2436          */
2437         if (state->id_enable_rc) {
2438                 state->id_hwcksum_capab = 0;
2439         } else {
2440                 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2441                     == IBT_HCA_CKSUM_FULL) {
2442                         state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2443                 }
2444         }
2445 
2446         /*
2447          * 2. Set LSO policy, capability and maximum length
2448          */
2449         if (state->id_enable_rc) {
2450                 state->id_lso_capable = B_FALSE;
2451                 state->id_lso_maxlen = 0;
2452         } else {
2453                 if (hca_attrs.hca_max_lso_size > 0) {
2454                         state->id_lso_capable = B_TRUE;
2455                         if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2456                                 state->id_lso_maxlen = IBD_LSO_MAXLEN;
2457                         else
2458                                 state->id_lso_maxlen =
2459                                     hca_attrs.hca_max_lso_size;
2460                 } else {
2461                         state->id_lso_capable = B_FALSE;
2462                         state->id_lso_maxlen = 0;
2463                 }
2464         }
2465 
2466         /*
2467          * 3. Set Reserved L_Key capability
2468          */
2469         if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2470                 state->id_hca_res_lkey_capab = 1;
2471                 state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2472                 state->rc_enable_iov_map = B_TRUE;
2473         } else {
2474                 /* If no reserved lkey, we will not use ibt_map_mem_iov */
2475                 state->rc_enable_iov_map = B_FALSE;
2476         }
2477 
2478         /*
2479          * 4. Set maximum sqseg value after checking to see if extended sgl
2480          *    size information is provided by the hca
2481          */
2482         if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2483                 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2484                 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2485         } else {
2486                 state->id_max_sqseg = hca_attrs.hca_max_sgl;
2487                 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2488         }
2489         if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2490                 state->id_max_sqseg = IBD_MAX_SQSEG;
2491         } else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2492                 ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2493                     state->id_max_sqseg, IBD_MAX_SQSEG);
2494         }
2495         if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2496                 state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2497         } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2498                 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2499                     "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2500         }
2501 
2502         /*
2503          * Translating the virtual address regions into physical regions
2504          * for using the Reserved LKey feature results in a wr sgl that
2505          * is a little longer. Since failing ibt_map_mem_iov() is costly,
2506          * we'll fix a high-water mark (65%) for when we should stop.
2507          */
2508         state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2509         state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
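        /*
         * For instance, with a (hypothetical) maximum of 60 segments, the
         * high-water mark would work out to (60 * 65) / 100 = 39 segments.
         */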
2510 
2511         /*
2512          * 5. Set number of recv and send wqes after checking hca maximum
2513          *    channel size. Store the max channel size in the state so that it
2514          *    can be referred to when the swqe/rwqe change is requested via
2515          *    dladm.
2516          */
2517 
2518         state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2519 
2520         if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2521                 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2522 
2523         state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2524             IBD_RWQE_MIN;
2525 
2526         if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2527                 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2528 
2529         _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2530 
2531         return (DDI_SUCCESS);
2532 }
2533 
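/*
 * Check whether this port/partition instance still has resources in use
 * (loaned rx buffers, outstanding RC SRQ buffers, or an in-progress
 * connection to a remote IPoIB port) that prevent its removal. Returns
 * DDI_SUCCESS if it is safe to proceed, DDI_FAILURE otherwise.
 */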
2534 static int
2535 ibd_part_busy(ibd_state_t *state)
2536 {
2537         if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2538                 DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
2539                 return (DDI_FAILURE);
2540         }
2541 
2542         if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2543                 DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
2544                 return (DDI_FAILURE);
2545         }
2546 
2547         /*
2548          * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
2549          * connecting to a remote IPoIB port. We can't remove this port.
2550          */
2551         if (state->id_ah_op == IBD_OP_ONGOING) {
2552                 DPRINT(10, "ibd_part_busy: failed: connecting\n");
2553                 return (DDI_FAILURE);
2554         }
2555 
2556         return (DDI_SUCCESS);
2557 }
2558 
2559 
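/*
 * Release, in reverse order of acquisition, the resources recorded in
 * the id_mac_state progress flags, clearing each flag as the
 * corresponding resource is freed.
 */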
2560 static void
2561 ibd_part_unattach(ibd_state_t *state)
2562 {
2563         uint32_t progress = state->id_mac_state;
2564         ibt_status_t ret;
2565 
2566         /* make sure rx resources are freed */
2567         ibd_free_rx_rsrcs(state);
2568 
2569         if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2570                 ASSERT(state->id_enable_rc);
2571                 ibd_rc_fini_srq_list(state);
2572                 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2573         }
2574 
2575         if (progress & IBD_DRV_MAC_REGISTERED) {
2576                 (void) mac_unregister(state->id_mh);
2577                 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2578         }
2579 
2580         if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2581                 /*
2582                  * No new async requests will be posted since the device
2583                  * link state has been marked as unknown; completion handlers
2584                  * have been turned off, so the Tx handler will not cause any
2585                  * more IBD_ASYNC_REAP requests.
2586                  *
2587                  * Queue a request for the async thread to exit, which will
2588                  * be serviced after any pending ones. This can take a while,
2589                  * especially if the SM is unreachable, since IBMF will slowly
2590                  * timeout each SM request issued by the async thread.  Reap
2591                  * the thread before continuing on, we do not want it to be
2592                  * lingering in modunloaded code.
2593                  */
2594                 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2595                 thread_join(state->id_async_thrid);
2596 
2597                 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2598         }
2599 
2600         if (progress & IBD_DRV_REQ_LIST_INITED) {
2601                 list_destroy(&state->id_req_list);
2602                 mutex_destroy(&state->id_acache_req_lock);
2603                 cv_destroy(&state->id_acache_req_cv);
2604                 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2605         }
2606 
2607         if (progress & IBD_DRV_PD_ALLOCD) {
2608                 if ((ret = ibt_free_pd(state->id_hca_hdl,
2609                     state->id_pd_hdl)) != IBT_SUCCESS) {
2610                         ibd_print_warn(state, "failed to free "
2611                             "protection domain, ret=%d", ret);
2612                 }
2613                 state->id_pd_hdl = NULL;
2614                 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2615         }
2616 
2617         if (progress & IBD_DRV_HCA_OPENED) {
2618                 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2619                     IBT_SUCCESS) {
2620                         ibd_print_warn(state, "failed to close "
2621                             "HCA device, ret=%d", ret);
2622                 }
2623                 state->id_hca_hdl = NULL;
2624                 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2625         }
2626 
2627         mutex_enter(&ibd_gstate.ig_mutex);
2628         if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2629                 if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2630                     IBT_SUCCESS) {
2631                         ibd_print_warn(state,
2632                             "ibt_detach() failed, ret=%d", ret);
2633                 }
2634                 state->id_ibt_hdl = NULL;
2635                 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2636                 ibd_gstate.ig_ibt_hdl_ref_cnt--;
2637         }
2638         if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2639             (ibd_gstate.ig_ibt_hdl != NULL)) {
2640                 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2641                     IBT_SUCCESS) {
2642                         ibd_print_warn(state, "ibt_detach(): global "
2643                             "failed, ret=%d", ret);
2644                 }
2645                 ibd_gstate.ig_ibt_hdl = NULL;
2646         }
2647         mutex_exit(&ibd_gstate.ig_mutex);
2648 
2649         if (progress & IBD_DRV_TXINTR_ADDED) {
2650                 ddi_remove_softintr(state->id_tx);
2651                 state->id_tx = NULL;
2652                 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2653         }
2654 
2655         if (progress & IBD_DRV_RXINTR_ADDED) {
2656                 ddi_remove_softintr(state->id_rx);
2657                 state->id_rx = NULL;
2658                 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2659         }
2660 
2661 #ifdef DEBUG
2662         if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2663                 kstat_delete(state->rc_ksp);
2664                 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2665         }
2666 #endif
2667 
2668         if (progress & IBD_DRV_STATE_INITIALIZED) {
2669                 ibd_state_fini(state);
2670                 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2671         }
2672 }
2673 
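/*
 * Common attach processing for port and partition instances: initialize
 * the soft state, add the rx and tx soft interrupts, attach to IBTL,
 * open the HCA, allocate the protection domain, record the HCA
 * capabilities and start the async thread. Each step is recorded in
 * id_mac_state so that ibd_part_unattach() can undo whatever has been
 * done if a later step fails.
 */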
2674 int
2675 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2676 {
2677         ibt_status_t ret;
2678         int rv;
2679         kthread_t *kht;
2680 
2681         /*
2682          * Initialize mutexes and condition variables
2683          */
2684         if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2685                 DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2686                 return (DDI_FAILURE);
2687         }
2688         state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2689 
2690         /*
2691          * Allocate rx,tx softintr
2692          */
2693         if (ibd_rx_softintr == 1) {
2694                 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2695                     NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2696                         DPRINT(10, "ibd_part_attach: failed in "
2697                             "ddi_add_softintr(id_rx),  ret=%d", rv);
2698                         return (DDI_FAILURE);
2699                 }
2700                 state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2701         }
2702         if (ibd_tx_softintr == 1) {
2703                 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2704                     NULL, NULL, ibd_tx_recycle,
2705                     (caddr_t)state)) != DDI_SUCCESS) {
2706                         DPRINT(10, "ibd_part_attach: failed in "
2707                             "ddi_add_softintr(id_tx), ret=%d", rv);
2708                         return (DDI_FAILURE);
2709                 }
2710                 state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2711         }
2712 
2713         /*
2714          * Attach to IBTL
2715          */
2716         mutex_enter(&ibd_gstate.ig_mutex);
2717         if (ibd_gstate.ig_ibt_hdl == NULL) {
2718                 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2719                     &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2720                         DPRINT(10, "ibd_part_attach: global: failed in "
2721                             "ibt_attach(), ret=%d", ret);
2722                         mutex_exit(&ibd_gstate.ig_mutex);
2723                         return (DDI_FAILURE);
2724                 }
2725         }
2726         if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2727             &state->id_ibt_hdl)) != IBT_SUCCESS) {
2728                 DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2729                     ret);
2730                 mutex_exit(&ibd_gstate.ig_mutex);
2731                 return (DDI_FAILURE);
2732         }
2733         ibd_gstate.ig_ibt_hdl_ref_cnt++;
2734         mutex_exit(&ibd_gstate.ig_mutex);
2735         state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2736 
2737         /*
2738          * Open the HCA
2739          */
2740         if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2741             &state->id_hca_hdl)) != IBT_SUCCESS) {
2742                 DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2743                     ret);
2744                 return (DDI_FAILURE);
2745         }
2746         state->id_mac_state |= IBD_DRV_HCA_OPENED;
2747 
2748 #ifdef DEBUG
2749         /* Initialize Driver Counters for Reliable Connected Mode */
2750         if (state->id_enable_rc) {
2751                 if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2752                         DPRINT(10, "ibd_part_attach: failed in "
2753                             "ibd_rc_init_stats");
2754                         return (DDI_FAILURE);
2755                 }
2756                 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2757         }
2758 #endif
2759 
2760         /*
2761          * Record capabilities
2762          */
2763         (void) ibd_record_capab(state);
2764 
2765         /*
2766          * Allocate a protection domain on the HCA
2767          */
2768         if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2769             &state->id_pd_hdl)) != IBT_SUCCESS) {
2770                 DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2771                     ret);
2772                 return (DDI_FAILURE);
2773         }
2774         state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2775 
2776 
2777         /*
2778          * Initialize the req_list that the async_thread relies on for
2779          * its operation.
2780          */
2781         mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2782         cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2783         list_create(&state->id_req_list, sizeof (ibd_req_t),
2784             offsetof(ibd_req_t, rq_list));
2785         state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2786 
2787         /*
2788          * Create the async thread; thread_create never fails.
2789          */
2790         kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2791             TS_RUN, minclsyspri);
2792         state->id_async_thrid = kht->t_did;
2793         state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2794 
2795         return (DDI_SUCCESS);
2796 }
2797 
2798 /*
2799  * Attach device to the IO framework.
2800  */
2801 static int
2802 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2803 {
2804         int ret;
2805 
2806         switch (cmd) {
2807                 case DDI_ATTACH:
2808                         ret = ibd_port_attach(dip);
2809                         break;
2810                 default:
2811                         ret = DDI_FAILURE;
2812                         break;
2813         }
2814         return (ret);
2815 }
2816 
2817 /*
2818  * Detach device from the IO framework.
2819  */
2820 static int
2821 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2822 {
2823         ibd_state_t *state;
2824         int instance;
2825 
2826         /*
2827          * IBD doesn't support suspend/resume
2828          */
2829         if (cmd != DDI_DETACH)
2830                 return (DDI_FAILURE);
2831 
2832         /*
2833          * Get the instance softstate
2834          */
2835         instance = ddi_get_instance(dip);
2836         state = ddi_get_soft_state(ibd_list, instance);
2837 
2838         /*
2839          * Release all resources we're holding still.  Note that if we'd
2840          * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2841          * so far, we should find all the flags we need in id_mac_state.
2842          */
2843         return (ibd_port_unattach(state, dip));
2844 }
2845 
2846 /*
2847  * Pre ibt_attach() driver initialization
2848  */
2849 static int
2850 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2851 {
2852         char buf[64];
2853 
2854         mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2855         state->id_link_state = LINK_STATE_UNKNOWN;
2856 
2857         mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2858         cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2859         state->id_trap_stop = B_TRUE;
2860         state->id_trap_inprog = 0;
2861 
2862         mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2863         mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2864         state->id_dip = dip;
2865 
2866         mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2867 
2868         mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2869         mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2870         mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2871         state->id_tx_busy = 0;
2872         mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2873 
2874         state->id_rx_list.dl_bufs_outstanding = 0;
2875         state->id_rx_list.dl_cnt = 0;
2876         mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2877         mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
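             /*
              * The request cache name encodes the instance number, pkey and
              * plinkid so that each port/partition instance gets a uniquely
              * named kmem cache.
              */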
2878         (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip),
2879             state->id_pkey, state->id_plinkid);
2880         state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2881             0, NULL, NULL, NULL, NULL, NULL, 0);
2882 
2883         /* For Reliable Connected Mode */
2884         mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2885         mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2886         mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2887         mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2888         mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2889             MUTEX_DRIVER, NULL);
2890         mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);
2891 
2892         /*
2893          * Set the default link mode to RC. If RC connection setup fails,
2894          * the link mode automatically falls back to UD.
2895          * Also set the RC MTU.
2896          */
2897         state->id_enable_rc = IBD_DEF_LINK_MODE;
2898         state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2899         state->id_mtu = IBD_DEF_MAX_MTU;
2900 
2901         /* Initialize all tunables to their defaults */
2902         state->id_lso_policy = IBD_DEF_LSO_POLICY;
2903         state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2904         state->id_num_ah = IBD_DEF_NUM_AH;
2905         state->id_hash_size = IBD_DEF_HASH_SIZE;
2906         state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2907         state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2908         state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2909         state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2910         state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2911         state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2912         state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2913         state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2914         state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2915         state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2916         state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2917         state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2918         state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2919         state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2920         state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2921         state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2922         state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2923         state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2924         state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2925         state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2926 
2927         return (DDI_SUCCESS);
2928 }
2929 
2930 /*
2931  * Post ibt_detach() driver deconstruction
2932  */
2933 static void
2934 ibd_state_fini(ibd_state_t *state)
2935 {
2936         kmem_cache_destroy(state->id_req_kmc);
2937 
2938         mutex_destroy(&state->id_rx_list.dl_mutex);
2939         mutex_destroy(&state->id_rx_free_list.dl_mutex);
2940 
2941         mutex_destroy(&state->id_txpost_lock);
2942         mutex_destroy(&state->id_tx_list.dl_mutex);
2943         mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2944         mutex_destroy(&state->id_lso_lock);
2945 
2946         mutex_destroy(&state->id_sched_lock);
2947         mutex_destroy(&state->id_scq_poll_lock);
2948         mutex_destroy(&state->id_rcq_poll_lock);
2949 
2950         cv_destroy(&state->id_trap_cv);
2951         mutex_destroy(&state->id_trap_lock);
2952         mutex_destroy(&state->id_link_mutex);
2953 
2954         /* For Reliable Connected Mode */
2955         mutex_destroy(&state->rc_timeout_lock);
2956         mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2957         mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2958         mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2959         mutex_destroy(&state->rc_tx_large_bufs_lock);
2960         mutex_destroy(&state->rc_rx_lock);
2961 }
2962 
2963 /*
2964  * Fetch link speed from SA for snmp ifspeed reporting.
2965  */
2966 static uint64_t
2967 ibd_get_portspeed(ibd_state_t *state)
2968 {
2969         int                     ret;
2970         ibt_path_info_t         path;
2971         ibt_path_attr_t         path_attr;
2972         uint8_t                 num_paths;
2973         uint64_t                ifspeed;
2974 
2975         /*
2976          * Due to serdes 8b/10b encoding, a 2.5 Gbps signaling rate on the
2977          * wire yields a 2 Gbps data rate. Thus, 1X single data rate is
2978          * 2000000000. Start with that as default.
2979          */
2980         ifspeed = 2000000000;
2981 
2982         bzero(&path_attr, sizeof (path_attr));
2983 
2984         /*
2985          * Get the port speed from Loopback path information.
2986          */
2987         path_attr.pa_dgids = &state->id_sgid;
2988         path_attr.pa_num_dgids = 1;
2989         path_attr.pa_sgid = state->id_sgid;
2990 
2991         if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2992             &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2993                 goto earlydone;
2994 
2995         if (num_paths < 1)
2996                 goto earlydone;
2997 
2998         /*
2999          * In case SA does not return an expected value, report the default
3000          * speed as 1X.
3001          */
3002         ret = 1;
3003         switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
3004                 case IBT_SRATE_2:       /*  1X SDR i.e 2.5 Gbps */
3005                         ret = 1;
3006                         break;
3007                 case IBT_SRATE_10:      /*  4X SDR or 1X QDR i.e 10 Gbps */
3008                         ret = 4;
3009                         break;
3010                 case IBT_SRATE_30:      /* 12X SDR i.e 30 Gbps */
3011                         ret = 12;
3012                         break;
3013                 case IBT_SRATE_5:       /*  1X DDR i.e  5 Gbps */
3014                         ret = 2;
3015                         break;
3016                 case IBT_SRATE_20:      /*  4X DDR or 8X SDR i.e 20 Gbps */
3017                         ret = 8;
3018                         break;
3019                 case IBT_SRATE_40:      /*  8X DDR or 4X QDR i.e 40 Gbps */
3020                         ret = 16;
3021                         break;
3022                 case IBT_SRATE_60:      /* 12X DDR i.e 60 Gbps */
3023                         ret = 24;
3024                         break;
3025                 case IBT_SRATE_80:      /*  8X QDR i.e 80 Gbps */
3026                         ret = 32;
3027                         break;
3028                 case IBT_SRATE_120:     /* 12X QDR i.e 120 Gbps */
3029                         ret = 48;
3030                         break;
3031         }
3032 
3033         ifspeed *= ret;
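             /*
              * For example, 4X DDR (IBT_SRATE_20) sets ret = 8 above, so the
              * reported ifspeed becomes 8 * 2 Gbps = 16 Gbps.
              */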
3034 
3035 earlydone:
3036         return (ifspeed);
3037 }
3038 
3039 /*
3040  * Search input mcg list (id_mc_full or id_mc_non) for an entry
3041  * representing the input mcg mgid.
3042  */
3043 static ibd_mce_t *
3044 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
3045 {
3046         ibd_mce_t *ptr = list_head(mlist);
3047 
3048         /*
3049          * Do plain linear search.
3050          */
3051         while (ptr != NULL) {
3052                 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
3053                     sizeof (ib_gid_t)) == 0)
3054                         return (ptr);
3055                 ptr = list_next(mlist, ptr);
3056         }
3057         return (NULL);
3058 }
3059 
3060 /*
3061  * Execute IBA JOIN.
3062  */
3063 static ibt_status_t
3064 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
3065 {
3066         ibt_mcg_attr_t mcg_attr;
3067 
3068         bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3069         mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
3070         mcg_attr.mc_mgid = mgid;
3071         mcg_attr.mc_join_state = mce->mc_jstate;
3072         mcg_attr.mc_scope = state->id_scope;
3073         mcg_attr.mc_pkey = state->id_pkey;
3074         mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
3075         mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
3076         mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
3077         return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
3078             NULL, NULL));
3079 }
3080 
3081 /*
3082  * This code JOINs the port in the proper way (depending on the join
3083  * state) so that IBA fabric will forward mcg packets to/from the port.
3084  * It also attaches the QPN to the mcg so it can receive those mcg
3085  * packets. This code makes sure not to attach the mcg to the QP if
3086  * that has been previously done due to the mcg being joined with a
3087  * different join state, even though this is not required by SWG_0216,
3088  * refid 3610.
3089  */
3090 static ibd_mce_t *
3091 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3092 {
3093         ibt_status_t ibt_status;
3094         ibd_mce_t *mce, *tmce, *omce = NULL;
3095         boolean_t do_attach = B_TRUE;
3096 
3097         DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
3098             jstate, mgid.gid_prefix, mgid.gid_guid);
3099 
3100         /*
3101          * For enable_multicast Full member joins, we need to do some
3102          * extra work. If there is already an mce on the list that
3103          * indicates full membership, that means the membership has
3104          * not yet been dropped (since the disable_multicast was issued)
3105          * because there are pending Tx's to the mcg; in that case, just
3106          * mark the mce not to be reaped when the Tx completion queues
3107          * an async reap operation.
3108          *
3109          * If there is already an mce on the list indicating sendonly
3110          * membership, try to promote to full membership. Be careful
3111          * not to deallocate the old mce, since there might be an AH
3112          * pointing to it; instead, update the old mce with new data
3113          * that tracks the full membership.
3114          */
3115         if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
3116             IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
3117                 if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
3118                         ASSERT(omce->mc_fullreap);
3119                         omce->mc_fullreap = B_FALSE;
3120                         return (omce);
3121                 } else {
3122                         ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3123                 }
3124         }
3125 
3126         /*
3127          * Allocate the ibd_mce_t to track this JOIN.
3128          */
3129         mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
3130         mce->mc_fullreap = B_FALSE;
3131         mce->mc_jstate = jstate;
3132 
3133         if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
3134                 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
3135                     ibt_status);
3136                 kmem_free(mce, sizeof (ibd_mce_t));
3137                 return (NULL);
3138         }
3139 
3140         /*
3141          * Is an IBA attach required? Not if the interface is already joined
3142          * to the mcg in a different appropriate join state.
3143          */
3144         if (jstate == IB_MC_JSTATE_NON) {
3145                 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3146                 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3147                         do_attach = B_FALSE;
3148         } else if (jstate == IB_MC_JSTATE_FULL) {
3149                 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3150                         do_attach = B_FALSE;
3151         } else {        /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3152                 do_attach = B_FALSE;
3153         }
3154 
3155         if (do_attach) {
3156                 /*
3157                  * Do the IBA attach.
3158                  */
3159                 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
3160                 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
3161                     &mce->mc_info)) != IBT_SUCCESS) {
3162                         DPRINT(10, "ibd_join_group : failed qp attachment "
3163                             "%d\n", ibt_status);
3164                         /*
3165                          * NOTE that we should probably preserve the join info
3166                          * in the list and later try to leave again at detach
3167                          * time.
3168                          */
3169                         (void) ibt_leave_mcg(state->id_sgid, mgid,
3170                             state->id_sgid, jstate);
3171                         kmem_free(mce, sizeof (ibd_mce_t));
3172                         return (NULL);
3173                 }
3174         }
3175 
3176         /*
3177          * Insert the ibd_mce_t in the proper list.
3178          */
3179         if (jstate == IB_MC_JSTATE_NON) {
3180                 IBD_MCACHE_INSERT_NON(state, mce);
3181         } else {
3182                 /*
3183                  * Set up the mc_req fields used for reaping the
3184                  * mcg in case of delayed tx completion (see
3185                  * ibd_tx_cleanup()). Also done for sendonly join in
3186                  * case we are promoted to fullmembership later and
3187                  * keep using the same mce.
3188                  */
3189                 mce->mc_req.rq_gid = mgid;
3190                 mce->mc_req.rq_ptr = mce;
3191                 /*
3192                  * Check whether we are trying to join as a full member
3193                  * while we were already joined send-only.
3194                  * We try to drop our SendOnly membership, but it is
3195                  * possible that the mcg does not exist anymore (and
3196                  * the subnet trap never reached us), so the leave
3197                  * operation might fail.
3198                  */
3199                 if (omce != NULL) {
3200                         (void) ibt_leave_mcg(state->id_sgid, mgid,
3201                             state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3202                         omce->mc_jstate = IB_MC_JSTATE_FULL;
3203                         bcopy(&mce->mc_info, &omce->mc_info,
3204                             sizeof (ibt_mcg_info_t));
3205                         kmem_free(mce, sizeof (ibd_mce_t));
3206                         return (omce);
3207                 }
3208                 mutex_enter(&state->id_mc_mutex);
3209                 IBD_MCACHE_INSERT_FULL(state, mce);
3210                 mutex_exit(&state->id_mc_mutex);
3211         }
3212 
3213         return (mce);
3214 }
3215 
3216 /*
3217  * Called during port up event handling to attempt to reacquire full
3218  * membership to an mcg. Stripped down version of ibd_join_group().
3219  * Note that it is possible that the mcg might have gone away, and
3220  * gets recreated at this point.
3221  */
3222 static void
3223 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3224 {
3225         ib_gid_t mgid;
3226 
3227         /*
3228          * If the mc_fullreap flag is set, or this join fails, a subsequent
3229          * reap/leave is going to try to leave the group. We could prevent
3230          * that by adding a boolean flag into ibd_mce_t, if required.
3231          */
3232         if (mce->mc_fullreap)
3233                 return;
3234 
3235         mgid = mce->mc_info.mc_adds_vect.av_dgid;
3236 
3237         DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3238             mgid.gid_guid);
3239 
3240         /* While reacquiring, leave and then join the MCG */
3241         (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
3242             mce->mc_jstate);
3243         if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3244                 ibd_print_warn(state, "Failure on port up to rejoin "
3245                     "multicast gid %016llx:%016llx",
3246                     (u_longlong_t)mgid.gid_prefix,
3247                     (u_longlong_t)mgid.gid_guid);
3248 }
3249 
3250 /*
3251  * This code handles delayed Tx completion cleanups for mcg's to which
3252  * disable_multicast has been issued, regular mcg related cleanups during
3253  * disable_multicast, disable_promiscuous and mcg traps, as well as
3254  * cleanups during driver detach time. Depending on the join state,
3255  * it deletes the mce from the appropriate list and issues the IBA
3256  * leave/detach; except in the disable_multicast case when the mce
3257  * is left on the active list for a subsequent Tx completion cleanup.
3258  */
3259 static void
3260 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3261     uint8_t jstate)
3262 {
3263         ibd_mce_t *tmce;
3264         boolean_t do_detach = B_TRUE;
3265 
3266         /*
3267          * Before detaching, we must check whether the other list
3268          * contains the mcg; if we detach blindly, the consumer
3269          * who set up the other list will also stop receiving
3270          * traffic.
3271          */
3272         if (jstate == IB_MC_JSTATE_FULL) {
3273                 /*
3274                  * The following check is only relevant while coming
3275                  * from the Tx completion path in the reap case.
3276                  */
3277                 if (!mce->mc_fullreap)
3278                         return;
3279                 mutex_enter(&state->id_mc_mutex);
3280                 IBD_MCACHE_PULLOUT_FULL(state, mce);
3281                 mutex_exit(&state->id_mc_mutex);
3282                 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3283                         do_detach = B_FALSE;
3284         } else if (jstate == IB_MC_JSTATE_NON) {
3285                 IBD_MCACHE_PULLOUT_NON(state, mce);
3286                 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3287                 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3288                         do_detach = B_FALSE;
3289         } else {        /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3290                 mutex_enter(&state->id_mc_mutex);
3291                 IBD_MCACHE_PULLOUT_FULL(state, mce);
3292                 mutex_exit(&state->id_mc_mutex);
3293                 do_detach = B_FALSE;
3294         }
3295 
3296         /*
3297          * If we are reacting to a mcg trap and leaving our sendonly or
3298          * non membership, the mcg is possibly already gone, so attempting
3299          * to leave might fail. On the other hand, we must try to leave
3300          * anyway, since this might be a trap from long ago, and we could
3301          * have potentially sendonly joined to a recent incarnation of
3302          * the mcg and are about to lose track of this information.
3303          */
3304         if (do_detach) {
3305                 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3306                     "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3307                 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3308         }
3309 
3310         (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3311         kmem_free(mce, sizeof (ibd_mce_t));
3312 }
3313 
3314 /*
3315  * Async code executed due to multicast and promiscuous disable requests
3316  * and mcg trap handling; also executed during driver detach. Mostly, a
3317  * leave and detach is done, except in the fullmember case when Tx
3318  * requests are pending, in which case arrangements are made for subsequent
3319  * cleanup on Tx completion.
3320  */
3321 static void
3322 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3323 {
3324         ipoib_mac_t mcmac;
3325         boolean_t recycled;
3326         ibd_mce_t *mce;
3327 
3328         DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3329             jstate, mgid.gid_prefix, mgid.gid_guid);
3330 
3331         if (jstate == IB_MC_JSTATE_NON) {
3332                 recycled = B_TRUE;
3333                 mce = IBD_MCACHE_FIND_NON(state, mgid);
3334                 /*
3335                  * In case we are handling a mcg trap, we might not find
3336                  * the mcg in the non list.
3337                  */
3338                 if (mce == NULL) {
3339                         return;
3340                 }
3341         } else {
3342                 mce = IBD_MCACHE_FIND_FULL(state, mgid);
3343 
3344                 /*
3345                  * In case we are handling a mcg trap, make sure the trap
3346                  * is not arriving late; if we have an mce that indicates
3347                  * that we are already a fullmember, that would be a clear
3348                  * indication that the trap arrived late (ie, is for a
3349                  * previous incarnation of the mcg).
3350                  */
3351                 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3352                         if ((mce == NULL) || (mce->mc_jstate ==
3353                             IB_MC_JSTATE_FULL)) {
3354                                 return;
3355                         }
3356                 } else {
3357                         ASSERT(jstate == IB_MC_JSTATE_FULL);
3358 
3359                         /*
3360                          * If the join group failed, mce will be NULL here,
3361                          * because the GLDv3 set-multicast entry point
3362                          * always returns success.
3363                          */
3364                         if (mce == NULL) {
3365                                 return;
3366                         }
3367 
3368                         mce->mc_fullreap = B_TRUE;
3369                 }
3370 
3371                 /*
3372                  * If no pending Tx's remain that reference the AH
3373                  * for the mcg, recycle it from active to free list.
3374                  * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3375                  * so the last completing Tx will cause an async reap
3376                  * operation to be invoked, at which time we will drop our
3377                  * membership to the mcg so that the pending Tx's complete
3378                  * successfully. Refer to comments on "AH and MCE active
3379                  * list manipulation" at top of this file. The lock protects
3380                  * against Tx fast path and Tx cleanup code.
3381                  */
3382                 mutex_enter(&state->id_ac_mutex);
3383                 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3384                 recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3385                     IB_MC_JSTATE_SEND_ONLY_NON));
3386                 mutex_exit(&state->id_ac_mutex);
3387         }
3388 
3389         if (recycled) {
3390                 DPRINT(2, "ibd_leave_group : leave_group reaping : "
3391                     "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3392                 ibd_async_reap_group(state, mce, mgid, jstate);
3393         }
3394 }
3395 
3396 /*
3397  * Find the broadcast address as defined by IPoIB; implicitly
3398  * determines the IBA scope, mtu, tclass etc of the link the
3399  * interface is going to be a member of.
3400  */
3401 static ibt_status_t
3402 ibd_find_bgroup(ibd_state_t *state)
3403 {
3404         ibt_mcg_attr_t mcg_attr;
3405         uint_t numg;
3406         uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3407             IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3408             IB_MC_SCOPE_GLOBAL };
3409         int i, mcgmtu;
3410         boolean_t found = B_FALSE;
3411         int ret;
3412         ibt_mcg_info_t mcg_info;
3413 
3414         state->id_bgroup_created = B_FALSE;
3415         state->id_bgroup_present = B_FALSE;
3416 
3417 query_bcast_grp:
3418         bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3419         mcg_attr.mc_pkey = state->id_pkey;
3420         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3421         state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3422         _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3423 
3424         for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3425                 state->id_scope = mcg_attr.mc_scope = scopes[i];
3426 
3427                 /*
3428                  * Look for the IPoIB broadcast group.
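                      * The MGID assembled below is the IPoIB IPv4 broadcast
                      * group for this pkey, of the form
                      * ff1x:401b:<pkey>::ffff:ffff (x being the scope probed),
                      * following the IPoIB (RFC 4391) multicast GID layout.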
3429                  */
3430                 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3431                 state->id_mgid.gid_prefix =
3432                     (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3433                     ((uint64_t)state->id_scope << 48) |
3434                     ((uint32_t)(state->id_pkey << 16)));
3435                 mcg_attr.mc_mgid = state->id_mgid;
3436                 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3437                 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3438                     &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3439                         found = B_TRUE;
3440                         break;
3441                 }
3442         }
3443 
3444         if (!found) {
3445                 if (state->id_create_broadcast_group) {
3446                         /*
3447                          * If we created the broadcast group, but failed to
3448                          * find it, we can't do anything except leave the
3449                          * one we created and return failure.
3450                          */
3451                         if (state->id_bgroup_created) {
3452                                 ibd_print_warn(state, "IPoIB broadcast group "
3453                                     "absent. Unable to query after create.");
3454                                 goto find_bgroup_fail;
3455                         }
3456 
3457                         /*
3458                          * Create the ipoib broadcast group if it didn't exist
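                              * (as a full member, with the default qkey and
                              * zero sl/flow/tclass).  After a successful
                              * create we jump back to query_bcast_grp so the
                              * normal query path fills in id_mcinfo and
                              * id_scope.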
3459                          */
3460                         bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3461                         mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3462                         mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3463                         mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3464                         mcg_attr.mc_pkey = state->id_pkey;
3465                         mcg_attr.mc_flow = 0;
3466                         mcg_attr.mc_sl = 0;
3467                         mcg_attr.mc_tclass = 0;
3468                         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3469                         state->id_mgid.gid_prefix =
3470                             (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3471                             ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3472                             ((uint32_t)(state->id_pkey << 16)));
3473                         mcg_attr.mc_mgid = state->id_mgid;
3474                         _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3475 
3476                         if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3477                             &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3478                                 ibd_print_warn(state, "IPoIB broadcast group "
3479                                     "absent, create failed: ret = %d\n", ret);
3480                                 state->id_bgroup_created = B_FALSE;
3481                                 return (IBT_FAILURE);
3482                         }
3483                         state->id_bgroup_created = B_TRUE;
3484                         goto query_bcast_grp;
3485                 } else {
3486                         ibd_print_warn(state, "IPoIB broadcast group absent");
3487                         return (IBT_FAILURE);
3488                 }
3489         }
3490 
3491         /*
3492          * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
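              * (mc_mtu is an IB MTU code; e.g. a code of 5 corresponds to
              * 128 << 5 = 4096 bytes.)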
3493          */
3494         mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3495         if (state->id_mtu < mcgmtu) {
3496                 ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3497                     "greater than port's maximum MTU %d", mcgmtu,
3498                     state->id_mtu);
3499                 ibt_free_mcg_info(state->id_mcinfo, 1);
3500                 goto find_bgroup_fail;
3501         }
3502         state->id_mtu = mcgmtu;
3503         state->id_bgroup_present = B_TRUE;
3504 
3505         return (IBT_SUCCESS);
3506 
3507 find_bgroup_fail:
3508         if (state->id_bgroup_created) {
3509                 (void) ibt_leave_mcg(state->id_sgid,
3510                     mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3511                     IB_MC_JSTATE_FULL);
3512         }
3513 
3514         return (IBT_FAILURE);
3515 }
3516 
3517 static int
3518 ibd_alloc_tx_copybufs(ibd_state_t *state)
3519 {
3520         ibt_mr_attr_t mem_attr;
3521 
3522         /*
3523          * Allocate one big chunk for all regular tx copy bufs
3524          */
3525         state->id_tx_buf_sz = state->id_mtu;
3526         if (state->id_lso_policy && state->id_lso_capable &&
3527             (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3528                 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3529         }
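             /*
              * i.e. id_tx_buf_sz ends up as max(id_mtu, id_ud_tx_copy_thresh)
              * when LSO is enabled, and as id_mtu otherwise.
              */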
3530 
3531         state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe *
3532             state->id_tx_buf_sz, KM_SLEEP);
3533 
3534         state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe *
3535             sizeof (ibd_swqe_t), KM_SLEEP);
3536 
3537         /*
3538          * Do one memory registration on the entire txbuf area
3539          */
3540         mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3541         mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz;
3542         mem_attr.mr_as = NULL;
3543         mem_attr.mr_flags = IBT_MR_SLEEP;
3544         if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3545             &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3546                 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3547                 kmem_free(state->id_tx_wqes,
3548                     state->id_ud_num_swqe * sizeof (ibd_swqe_t));
3549                 kmem_free(state->id_tx_bufs,
3550                     state->id_ud_num_swqe * state->id_tx_buf_sz);
3551                 state->id_tx_bufs = NULL;
3552                 return (DDI_FAILURE);
3553         }
3554 
3555         return (DDI_SUCCESS);
3556 }
3557 
3558 static int
3559 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3560 {
3561         ibt_mr_attr_t mem_attr;
3562         ibd_lsobuf_t *buflist;
3563         ibd_lsobuf_t *lbufp;
3564         ibd_lsobuf_t *tail;
3565         ibd_lsobkt_t *bktp;
3566         uint8_t *membase;
3567         uint8_t *memp;
3568         uint_t memsz;
3569         int i;
3570 
3571         /*
3572          * Allocate the lso bucket
3573          */
3574         bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3575 
3576         /*
3577          * Allocate the entire lso memory and register it
3578          */
3579         memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ;
3580         membase = kmem_zalloc(memsz, KM_SLEEP);
3581 
3582         mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3583         mem_attr.mr_len = memsz;
3584         mem_attr.mr_as = NULL;
3585         mem_attr.mr_flags = IBT_MR_SLEEP;
3586         if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3587             &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3588                 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3589                 kmem_free(membase, memsz);
3590                 kmem_free(bktp, sizeof (ibd_lsobkt_t));
3591                 return (DDI_FAILURE);
3592         }
3593 
3594         mutex_enter(&state->id_lso_lock);
3595 
3596         /*
3597          * Now allocate the buflist.  Note that the elements in the buflist and
3598          * the buffers in the lso memory have a permanent 1-1 relation, so we
3599          * can always derive the address of a buflist entry from the address of
3600          * an lso buffer.
3601          */
3602         buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t),
3603             KM_SLEEP);
3604 
3605         /*
3606          * Set up the lso buf chain
3607          */
3608         memp = membase;
3609         lbufp = buflist;
3610         for (i = 0; i < state->id_num_lso_bufs; i++) {
3611                 lbufp->lb_isfree = 1;
3612                 lbufp->lb_buf = memp;
3613                 lbufp->lb_next = lbufp + 1;
3614 
3615                 tail = lbufp;
3616 
3617                 memp += IBD_LSO_BUFSZ;
3618                 lbufp++;
3619         }
3620         tail->lb_next = NULL;
3621 
3622         /*
3623          * Set up the LSO buffer information in ibd state
3624          */
3625         bktp->bkt_bufl = buflist;
3626         bktp->bkt_free_head = buflist;
3627         bktp->bkt_mem = membase;
3628         bktp->bkt_nelem = state->id_num_lso_bufs;
3629         bktp->bkt_nfree = bktp->bkt_nelem;
3630 
3631         state->id_lso = bktp;
3632         mutex_exit(&state->id_lso_lock);
3633 
3634         return (DDI_SUCCESS);
3635 }
3636 
3637 /*
3638  * Statically allocate Tx buffer list(s).
3639  */
3640 static int
3641 ibd_init_txlist(ibd_state_t *state)
3642 {
3643         ibd_swqe_t *swqe;
3644         ibt_lkey_t lkey;
3645         int i;
3646         uint_t len;
3647         uint8_t *bufaddr;
3648 
3649         if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3650                 return (DDI_FAILURE);
3651 
3652         if (state->id_lso_policy && state->id_lso_capable) {
3653                 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3654                         state->id_lso_capable = B_FALSE;
3655         }
3656 
3657         mutex_enter(&state->id_tx_list.dl_mutex);
3658         state->id_tx_list.dl_head = NULL;
3659         state->id_tx_list.dl_pending_sends = B_FALSE;
3660         state->id_tx_list.dl_cnt = 0;
3661         mutex_exit(&state->id_tx_list.dl_mutex);
3662         mutex_enter(&state->id_tx_rel_list.dl_mutex);
3663         state->id_tx_rel_list.dl_head = NULL;
3664         state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3665         state->id_tx_rel_list.dl_cnt = 0;
3666         mutex_exit(&state->id_tx_rel_list.dl_mutex);
3667 
3668         /*
3669          * Allocate and setup the swqe list
3670          */
3671         lkey = state->id_tx_mr_desc.md_lkey;
3672         bufaddr = state->id_tx_bufs;
3673         len = state->id_tx_buf_sz;
3674         swqe = state->id_tx_wqes;
3675         mutex_enter(&state->id_tx_list.dl_mutex);
3676         for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) {
3677                 swqe->swqe_next = NULL;
3678                 swqe->swqe_im_mblk = NULL;
3679 
3680                 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3681                     bufaddr;
3682                 swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3683                 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3684 
3685                 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3686                 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3687                 swqe->w_swr.wr_trans = IBT_UD_SRV;
3688 
3689                 /* These are set in send */
3690                 swqe->w_swr.wr_nds = 0;
3691                 swqe->w_swr.wr_sgl = NULL;
3692                 swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3693 
3694                 /* add to list */
3695                 state->id_tx_list.dl_cnt++;
3696                 swqe->swqe_next = state->id_tx_list.dl_head;
3697                 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3698         }
3699         mutex_exit(&state->id_tx_list.dl_mutex);
3700 
3701         return (DDI_SUCCESS);
3702 }
3703 
3704 static int
3705 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3706     uint32_t *nds_p)
3707 {
3708         ibd_lsobkt_t *bktp;
3709         ibd_lsobuf_t *lbufp;
3710         ibd_lsobuf_t *nextp;
3711         ibt_lkey_t lso_lkey;
3712         uint_t frag_sz;
3713         uint_t num_needed;
3714         int i;
3715 
3716         ASSERT(sgl_p != NULL);
3717         ASSERT(nds_p != NULL);
3718         ASSERT(req_sz != 0);
3719 
3720         /*
3721          * Determine how many bufs we'd need for the size requested
3722          */
3723         num_needed = req_sz / IBD_LSO_BUFSZ;
3724         if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3725                 num_needed++;
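             /*
              * For example, a request of 2 * IBD_LSO_BUFSZ + 100 bytes needs
              * three buffers; frag_sz (100) is used below to trim the last
              * sgl entry's length.
              */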
3726 
3727         mutex_enter(&state->id_lso_lock);
3728 
3729         /*
3730          * If we don't have enough lso bufs, return failure
3731          */
3732         ASSERT(state->id_lso != NULL);
3733         bktp = state->id_lso;
3734         if (bktp->bkt_nfree < num_needed) {
3735                 mutex_exit(&state->id_lso_lock);
3736                 return (-1);
3737         }
3738 
3739         /*
3740          * Pick the first 'num_needed' bufs from the free list
3741          */
3742         lso_lkey = bktp->bkt_mr_desc.md_lkey;
3743         lbufp = bktp->bkt_free_head;
3744         for (i = 0; i < num_needed; i++) {
3745                 ASSERT(lbufp->lb_isfree != 0);
3746                 ASSERT(lbufp->lb_buf != NULL);
3747 
3748                 nextp = lbufp->lb_next;
3749 
3750                 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3751                 sgl_p[i].ds_key = lso_lkey;
3752                 sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3753 
3754                 lbufp->lb_isfree = 0;
3755                 lbufp->lb_next = NULL;
3756 
3757                 lbufp = nextp;
3758         }
3759         bktp->bkt_free_head = lbufp;
3760 
3761         /*
3762          * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3763          * to adjust the last sgl entry's length. Since we know we need at least
3764          * one, the i-1 use below is ok.
3765          */
3766         if (frag_sz) {
3767                 sgl_p[i-1].ds_len = frag_sz;
3768         }
3769 
3770         /*
3771          * Update nfree count and return
3772          */
3773         bktp->bkt_nfree -= num_needed;
3774 
3775         mutex_exit(&state->id_lso_lock);
3776 
3777         *nds_p = num_needed;
3778 
3779         return (0);
3780 }
3781 
3782 static void
3783 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3784 {
3785         ibd_lsobkt_t *bktp;
3786         ibd_lsobuf_t *lbufp;
3787         uint8_t *lso_mem_end;
3788         uint_t ndx;
3789         int i;
3790 
3791         mutex_enter(&state->id_lso_lock);
3792 
3793         bktp = state->id_lso;
3794         ASSERT(bktp != NULL);
3795 
3796         lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3797         for (i = 0; i < nds; i++) {
3798                 uint8_t *va;
3799 
3800                 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3801                 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3802 
3803                 /*
3804                  * Figure out the buflist element this sgl buffer corresponds
3805                  * to and put it back at the head
3806                  */
3807                 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3808                 lbufp = bktp->bkt_bufl + ndx;
3809 
3810                 ASSERT(lbufp->lb_isfree == 0);
3811                 ASSERT(lbufp->lb_buf == va);
3812 
3813                 lbufp->lb_isfree = 1;
3814                 lbufp->lb_next = bktp->bkt_free_head;
3815                 bktp->bkt_free_head = lbufp;
3816         }
3817         bktp->bkt_nfree += nds;
3818 
3819         mutex_exit(&state->id_lso_lock);
3820 }
3821 
3822 static void
3823 ibd_free_tx_copybufs(ibd_state_t *state)
3824 {
3825         /*
3826          * Unregister txbuf mr
3827          */
3828         if (ibt_deregister_mr(state->id_hca_hdl,
3829             state->id_tx_mr_hdl) != IBT_SUCCESS) {
3830                 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3831         }
3832         state->id_tx_mr_hdl = NULL;
3833 
3834         /*
3835          * Free txbuf memory
3836          */
3837         kmem_free(state->id_tx_wqes, state->id_ud_num_swqe *
3838             sizeof (ibd_swqe_t));
3839         kmem_free(state->id_tx_bufs, state->id_ud_num_swqe *
3840             state->id_tx_buf_sz);
3841         state->id_tx_wqes = NULL;
3842         state->id_tx_bufs = NULL;
3843 }
3844 
3845 static void
3846 ibd_free_tx_lsobufs(ibd_state_t *state)
3847 {
3848         ibd_lsobkt_t *bktp;
3849 
3850         mutex_enter(&state->id_lso_lock);
3851 
3852         if ((bktp = state->id_lso) == NULL) {
3853                 mutex_exit(&state->id_lso_lock);
3854                 return;
3855         }
3856 
3857         /*
3858          * First, free the buflist
3859          */
3860         ASSERT(bktp->bkt_bufl != NULL);
3861         kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3862 
3863         /*
3864          * Unregister the LSO memory and free it
3865          */
3866         ASSERT(bktp->bkt_mr_hdl != NULL);
3867         if (ibt_deregister_mr(state->id_hca_hdl,
3868             bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3869                 DPRINT(10,
3870                     "ibd_free_tx_lsobufs: ibt_deregister_mr failed");
3871         }
3872         ASSERT(bktp->bkt_mem);
3873         kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3874 
3875         /*
3876          * Finally free the bucket
3877          */
3878         kmem_free(bktp, sizeof (ibd_lsobkt_t));
3879         state->id_lso = NULL;
3880 
3881         mutex_exit(&state->id_lso_lock);
3882 }
3883 
3884 /*
3885  * Free the statically allocated Tx buffer list.
3886  */
3887 static void
3888 ibd_fini_txlist(ibd_state_t *state)
3889 {
3890         /*
3891          * Free the allocated swqes
3892          */
3893         mutex_enter(&state->id_tx_list.dl_mutex);
3894         mutex_enter(&state->id_tx_rel_list.dl_mutex);
3895         state->id_tx_list.dl_head = NULL;
3896         state->id_tx_list.dl_pending_sends = B_FALSE;
3897         state->id_tx_list.dl_cnt = 0;
3898         state->id_tx_rel_list.dl_head = NULL;
3899         state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3900         state->id_tx_rel_list.dl_cnt = 0;
3901         mutex_exit(&state->id_tx_rel_list.dl_mutex);
3902         mutex_exit(&state->id_tx_list.dl_mutex);
3903 
3904         ibd_free_tx_lsobufs(state);
3905         ibd_free_tx_copybufs(state);
3906 }
3907 
3908 /*
3909  * Post a NULL-terminated list of rwqes.
3910  */
3911 static void
3912 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3913 {
3914         uint_t          i;
3915         uint_t          num_posted;
3916         ibt_status_t    ibt_status;
3917         ibt_recv_wr_t   wrs[IBD_RX_POST_CNT];
3918 
3919         while (rwqe) {
3920                 /* Post up to IBD_RX_POST_CNT receive work requests */
3921                 for (i = 0; i < IBD_RX_POST_CNT; i++) {
3922                         wrs[i] = rwqe->w_rwr;
3923                         rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3924                         if (rwqe == NULL) {
3925                                 i++;
3926                                 break;
3927                         }
3928                 }
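                     /*
                      * 'i' now holds the number of wrs actually copied into
                      * wrs[] (note the i++ before the early break above).
                      */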
3929 
3930                 /*
3931                  * If posting fails for some reason, we'll never receive
3932                  * completion intimation, so we'll need to cleanup. But
3933                  * we need to make sure we don't clean up nodes whose
3934                  * wrs have been successfully posted. We assume that the
3935                  * hca driver returns on the first failure to post and
3936                  * therefore the first 'num_posted' entries don't need
3937                  * cleanup here.
3938                  */
3939                 atomic_add_32(&state->id_rx_list.dl_cnt, i);
3940 
3941                 num_posted = 0;
3942                 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3943                     &num_posted);
3944                 if (ibt_status != IBT_SUCCESS) {
3945                         /* This cannot happen unless the device has an error. */
3946                         ibd_print_warn(state, "ibd_post_recv: FATAL: "
3947                             "posting multiple wrs failed: "
3948                             "requested=%d, done=%d, ret=%d",
3949                             IBD_RX_POST_CNT, num_posted, ibt_status);
3950                         atomic_add_32(&state->id_rx_list.dl_cnt,
3951                             num_posted - i);
3952                 }
3953         }
3954 }
3955 
3956 /*
3957  * Grab a list of rwqes from the array of lists, and post the list.
3958  */
3959 static void
3960 ibd_post_recv_intr(ibd_state_t *state)
3961 {
3962         ibd_rx_queue_t  *rxp;
3963         ibd_rwqe_t *list;
3964 
3965         /* rotate through the rx_queue array, expecting an adequate number of rwqes */
3966         state->id_rx_post_queue_index =
3967             (state->id_rx_post_queue_index + 1) &
3968             (state->id_rx_nqueues - 1);
3969 
3970         rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3971         mutex_enter(&rxp->rx_post_lock);
3972         list = WQE_TO_RWQE(rxp->rx_head);
3973         rxp->rx_head = NULL;
3974         rxp->rx_cnt = 0;
3975         mutex_exit(&rxp->rx_post_lock);
3976         ibd_post_recv_list(state, list);
3977 }
3978 
3979 /* macro explained below */
3980 #define RX_QUEUE_HASH(rwqe) \
3981         (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3982 
3983 /*
3984  * Add a rwqe to one of the Rx lists.  If the list is large enough
3985  * (close to IBD_RX_POST_CNT entries), post the list to the hardware.
3986  *
3987  * Note: one of 2^N lists is chosen via a hash.  This is done
3988  * because using one list is contentious.  If the first list is busy
3989  * (mutex_tryenter fails), use a second list (just call mutex_enter).
3990  *
3991  * The number 8 in RX_QUEUE_HASH is a random choice that provides
3992  * even distribution of mapping rwqes to the 2^N queues.
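      * In effect, bits [8, 8 + log2(id_rx_nqueues)) of the rwqe address
      * select the queue.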
3993  */
3994 static void
3995 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3996 {
3997         ibd_rx_queue_t  *rxp;
3998 
3999         rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
4000 
4001         if (!mutex_tryenter(&rxp->rx_post_lock)) {
4002                 /* Failed.  Try a different queue ("ptr + 16" ensures that). */
4003                 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
4004                 mutex_enter(&rxp->rx_post_lock);
4005         }
4006         rwqe->rwqe_next = rxp->rx_head;
4007         if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
4008                 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
4009 
4010                 /* only call ibt_post_recv() every Nth time through here */
4011                 if ((active & (state->id_rx_nqueues - 1)) == 0) {
4012                         rxp->rx_head = NULL;
4013                         rxp->rx_cnt = 0;
4014                         mutex_exit(&rxp->rx_post_lock);
4015                         ibd_post_recv_list(state, rwqe);
4016                         return;
4017                 }
4018         }
4019         rxp->rx_head = RWQE_TO_WQE(rwqe);
4020         mutex_exit(&rxp->rx_post_lock);
4021 }
4022 
4023 static int
4024 ibd_alloc_rx_copybufs(ibd_state_t *state)
4025 {
4026         ibt_mr_attr_t mem_attr;
4027         int i;
4028 
4029         /*
4030          * Allocate one big chunk for all regular rx copy bufs
4031          */
4032         state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
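             /*
              * Each rx buffer reserves room for the GRH (nominally 40 bytes)
              * that is delivered ahead of every UD payload.
              */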
4033 
4034         state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe *
4035             state->id_rx_buf_sz, KM_SLEEP);
4036 
4037         state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe *
4038             sizeof (ibd_rwqe_t), KM_SLEEP);
4039 
4040         state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
4041         state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
4042             sizeof (ibd_rx_queue_t), KM_SLEEP);
4043         for (i = 0; i < state->id_rx_nqueues; i++) {
4044                 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4045                 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
4046         }
4047 
4048         /*
4049          * Do one memory registration on the entire rxbuf area
4050          */
4051         mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
4052         mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz;
4053         mem_attr.mr_as = NULL;
4054         mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4055         if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
4056             &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
4057                 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
4058                 kmem_free(state->id_rx_wqes,
4059                     state->id_ud_num_rwqe * sizeof (ibd_rwqe_t));
4060                 kmem_free(state->id_rx_bufs,
4061                     state->id_ud_num_rwqe * state->id_rx_buf_sz);
4062                 state->id_rx_bufs = NULL;
4063                 state->id_rx_wqes = NULL;
4064                 return (DDI_FAILURE);
4065         }
4066 
4067         return (DDI_SUCCESS);
4068 }
4069 
4070 /*
4071  * Allocate the statically allocated Rx buffer list.
4072  */
4073 static int
4074 ibd_init_rxlist(ibd_state_t *state)
4075 {
4076         ibd_rwqe_t *rwqe, *next;
4077         ibd_wqe_t *list;
4078         ibt_lkey_t lkey;
4079         int i;
4080         uint_t len;
4081         uint8_t *bufaddr;
4082 
4083         mutex_enter(&state->id_rx_free_list.dl_mutex);
4084         if (state->id_rx_free_list.dl_head != NULL) {
4085                 /* rx rsrcs were never freed.  Just repost them */
4086                 len = state->id_rx_buf_sz;
4087                 list = state->id_rx_free_list.dl_head;
4088                 state->id_rx_free_list.dl_head = NULL;
4089                 state->id_rx_free_list.dl_cnt = 0;
4090                 mutex_exit(&state->id_rx_free_list.dl_mutex);
4091                 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4092                     rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4093                         if ((rwqe->rwqe_im_mblk = desballoc(
4094                             rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
4095                             &rwqe->w_freemsg_cb)) == NULL) {
4096                                 /* allow freemsg_cb to free the rwqes */
4097                                 if (atomic_dec_32_nv(&state->id_running) != 0) {
4098                                         cmn_err(CE_WARN, "ibd_init_rxlist: "
4099                                             "id_running was not 1\n");
4100                                 }
4101                                 DPRINT(10, "ibd_init_rxlist : "
4102                                     "failed in desballoc()");
4103                                 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4104                                     rwqe = next) {
4105                                         next = WQE_TO_RWQE(rwqe->rwqe_next);
4106                                         if (rwqe->rwqe_im_mblk) {
4107                                                 atomic_inc_32(&state->
4108                                                     id_rx_list.
4109                                                     dl_bufs_outstanding);
4110                                                 freemsg(rwqe->rwqe_im_mblk);
4111                                         } else
4112                                                 ibd_free_rwqe(state, rwqe);
4113                                 }
4114                                 atomic_inc_32(&state->id_running);
4115                                 return (DDI_FAILURE);
4116                         }
4117                 }
4118                 ibd_post_recv_list(state, WQE_TO_RWQE(list));
4119                 return (DDI_SUCCESS);
4120         }
4121         mutex_exit(&state->id_rx_free_list.dl_mutex);
4122 
4123         if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
4124                 return (DDI_FAILURE);
4125 
4126         /*
4127          * Allocate and setup the rwqe list
4128          */
4129         len = state->id_rx_buf_sz;
4130         lkey = state->id_rx_mr_desc.md_lkey;
4131         rwqe = state->id_rx_wqes;
4132         bufaddr = state->id_rx_bufs;
4133         list = NULL;
4134         for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) {
4135                 rwqe->w_state = state;
4136                 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
4137                 rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
4138 
4139                 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
4140 
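                /*
                 * desballoc(9F) wraps the pre-registered copy buffer in an
                 * mblk without copying; when the upper layers eventually
                 * freemsg() it, w_freemsg_cb (ibd_freemsg_cb) runs and the
                 * rwqe can be reclaimed, so received data is loaned up the
                 * stack rather than duplicated.
                 */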
4141                 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
4142                     &rwqe->w_freemsg_cb)) == NULL) {
4143                         DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
4144                         /* allow freemsg_cb to free the rwqes */
4145                         if (atomic_dec_32_nv(&state->id_running) != 0) {
4146                                 cmn_err(CE_WARN, "ibd_init_rxlist: "
4147                                     "id_running was not 1\n");
4148                         }
4151                         for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4152                             rwqe = next) {
4153                                 next = WQE_TO_RWQE(rwqe->rwqe_next);
4154                                 freemsg(rwqe->rwqe_im_mblk);
4155                         }
4156                         atomic_inc_32(&state->id_running);
4157 
4158                         /* remove reference to freed rwqes */
4159                         mutex_enter(&state->id_rx_free_list.dl_mutex);
4160                         state->id_rx_free_list.dl_head = NULL;
4161                         state->id_rx_free_list.dl_cnt = 0;
4162                         mutex_exit(&state->id_rx_free_list.dl_mutex);
4163 
4164                         ibd_fini_rxlist(state);
4165                         return (DDI_FAILURE);
4166                 }
4167 
4168                 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
4169                 rwqe->rwqe_copybuf.ic_sgl.ds_va =
4170                     (ib_vaddr_t)(uintptr_t)bufaddr;
4171                 rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
4172                 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
4173                 rwqe->w_rwr.wr_nds = 1;
4174                 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
4175 
4176                 rwqe->rwqe_next = list;
4177                 list = RWQE_TO_WQE(rwqe);
4178         }
4179         ibd_post_recv_list(state, WQE_TO_RWQE(list));
4180 
4181         return (DDI_SUCCESS);
4182 }
4183 
4184 static void
4185 ibd_free_rx_copybufs(ibd_state_t *state)
4186 {
4187         int i;
4188 
4189         /*
4190          * Unregister rxbuf mr
4191          */
4192         if (ibt_deregister_mr(state->id_hca_hdl,
4193             state->id_rx_mr_hdl) != IBT_SUCCESS) {
4194                 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
4195         }
4196         state->id_rx_mr_hdl = NULL;
4197 
4198         /*
4199          * Free rxbuf memory
4200          */
4201         for (i = 0; i < state->id_rx_nqueues; i++) {
4202                 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4203                 mutex_destroy(&rxp->rx_post_lock);
4204         }
4205         kmem_free(state->id_rx_queues, state->id_rx_nqueues *
4206             sizeof (ibd_rx_queue_t));
4207         kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe *
4208             sizeof (ibd_rwqe_t));
4209         kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe *
4210             state->id_rx_buf_sz);
4211         state->id_rx_queues = NULL;
4212         state->id_rx_wqes = NULL;
4213         state->id_rx_bufs = NULL;
4214 }
4215 
4216 static void
4217 ibd_free_rx_rsrcs(ibd_state_t *state)
4218 {
4219         mutex_enter(&state->id_rx_free_list.dl_mutex);
4220         if (state->id_rx_free_list.dl_head == NULL) {
4221                 /* already freed */
4222                 mutex_exit(&state->id_rx_free_list.dl_mutex);
4223                 return;
4224         }
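        /*
         * Every rwqe must be back on the free list before the wqe array
         * and the copybuf slab can be released; the ASSERT below checks
         * that nothing is still posted or loaned up the stack.
         */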
4225         ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe);
4226         ibd_free_rx_copybufs(state);
4227         state->id_rx_free_list.dl_cnt = 0;
4228         state->id_rx_free_list.dl_head = NULL;
4229         mutex_exit(&state->id_rx_free_list.dl_mutex);
4230 }
4231 
4232 /*
4233  * Free the statically allocated Rx buffer list.
4234  */
4235 static void
4236 ibd_fini_rxlist(ibd_state_t *state)
4237 {
4238         ibd_rwqe_t *rwqe;
4239         int i;
4240 
4241         /* run through the rx_queue's, calling freemsg() */
4242         for (i = 0; i < state->id_rx_nqueues; i++) {
4243                 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4244                 mutex_enter(&rxp->rx_post_lock);
4245                 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
4246                     rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4247                         freemsg(rwqe->rwqe_im_mblk);
4248                         rxp->rx_cnt--;
4249                 }
4250                 rxp->rx_head = NULL;
4251                 mutex_exit(&rxp->rx_post_lock);
4252         }
4253 
4254         /* cannot free rx resources unless gld returned everything */
4255         if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
4256                 ibd_free_rx_rsrcs(state);
4257 }
4258 
4259 /*
4260  * Free an allocated recv wqe.
4261  */
4262 /* ARGSUSED */
4263 static void
4264 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
4265 {
4266         /*
4267          * desballoc() failed (no memory).
4268          *
4269          * This rwqe is placed on a free list so that it
4270          * can be reinstated when memory is available.
4271          *
4272          * NOTE: no code currently exists to reinstate
4273          * these "lost" rwqes.
4274          */
4275         mutex_enter(&state->id_rx_free_list.dl_mutex);
4276         state->id_rx_free_list.dl_cnt++;
4277         rwqe->rwqe_next = state->id_rx_free_list.dl_head;
4278         state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
4279         mutex_exit(&state->id_rx_free_list.dl_mutex);
4280 }
4281 
4282 /*
4283  * IBA Rx completion queue handler. Guaranteed to be single
4284  * threaded and nonreentrant for this CQ.
4285  */
4286 /* ARGSUSED */
4287 static void
4288 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4289 {
4290         ibd_state_t *state = (ibd_state_t *)arg;
4291 
4292         atomic_inc_64(&state->id_num_intrs);
4293 
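        /*
         * Dispatch: when soft interrupts are enabled (ibd_rx_softintr) and
         * a poll of this CQ is already in progress, only set the
         * IBD_REDO_CQ_POLLING flag so the active poller rescans the CQ;
         * otherwise trigger the Rx soft interrupt. With soft interrupts
         * disabled, the CQ is drained directly via ibd_intr().
         */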
4294         if (ibd_rx_softintr == 1) {
4295                 mutex_enter(&state->id_rcq_poll_lock);
4296                 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4297                         state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
4298                         mutex_exit(&state->id_rcq_poll_lock);
4299                         return;
4300                 } else {
4301                         mutex_exit(&state->id_rcq_poll_lock);
4302                         ddi_trigger_softintr(state->id_rx);
4303                 }
4304         } else
4305                 (void) ibd_intr((caddr_t)state);
4306 }
4307 
4308 /*
4309  * CQ handler for Tx completions, when the Tx CQ is in
4310  * interrupt driven mode.
4311  */
4312 /* ARGSUSED */
4313 static void
4314 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4315 {
4316         ibd_state_t *state = (ibd_state_t *)arg;
4317 
4318         atomic_inc_64(&state->id_num_intrs);
4319 
4320         if (ibd_tx_softintr == 1) {
4321                 mutex_enter(&state->id_scq_poll_lock);
4322                 if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4323                         state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4324                         mutex_exit(&state->id_scq_poll_lock);
4325                         return;
4326                 } else {
4327                         mutex_exit(&state->id_scq_poll_lock);
4328                         ddi_trigger_softintr(state->id_tx);
4329                 }
4330         } else
4331                 (void) ibd_tx_recycle((caddr_t)state);
4332 }
4333 
4334 /*
4335  * Multicast group create/delete trap handler. These will be delivered
4336  * on a kernel thread (handling can thus block) and can be invoked
4337  * concurrently. The handler can be invoked anytime after it is
4338  * registered and before ibt_detach().
4339  */
4340 /* ARGSUSED */
4341 static void
4342 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4343     ibt_subnet_event_t *event)
4344 {
4345         ibd_state_t *state = (ibd_state_t *)arg;
4346         ibd_req_t *req;
4347 
4348         /*
4349          * The trap handler will get invoked once for every event for
4350          * every port. The input "gid" is the GID0 of the port the
4351          * trap came in on; we just need to act on traps that came
4352          * to our port, meaning the port on which the ipoib interface
4353          * resides. Since ipoib uses GID0 of the port, we just match
4354          * the gids to check whether we need to handle the trap.
4355          */
4356         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4357         if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4358                 return;
4359         _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4360 
4361         DPRINT(10, "ibd_notices_handler : %d\n", code);
4362 
4363         switch (code) {
4364                 case IBT_SM_EVENT_UNAVAILABLE:
4365                         /*
4366                          * If we are in promiscuous mode or have
4367                          * sendnonmembers, we need to print a warning
4368                          * message right now. Else, just store the
4369                          * information, print when we enter promiscuous
4370                          * mode or attempt nonmember send. We might
4371                          * also want to stop caching sendnonmember.
4372                          */
4373                         ibd_print_warn(state, "IBA multicast support "
4374                             "degraded due to unavailability of multicast "
4375                             "traps");
4376                         break;
4377                 case IBT_SM_EVENT_AVAILABLE:
4378                         /*
4379                          * If we printed a warning message above or
4380                          * while trying to nonmember send or get into
4381                          * promiscuous mode, print an okay message.
4382                          */
4383                         ibd_print_warn(state, "IBA multicast support "
4384                             "restored due to availability of multicast "
4385                             "traps");
4386                         break;
4387                 case IBT_SM_EVENT_MCG_CREATED:
4388                 case IBT_SM_EVENT_MCG_DELETED:
4389                         /*
4390                          * If it is a "deleted" event and we are in late hca
4391                          * init, nothing to do.
4392                          */
4393                         if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4394                             IBD_DRV_IN_LATE_HCA_INIT) && (code ==
4395                             IBT_SM_EVENT_MCG_DELETED)) {
4396                                 break;
4397                         }
4398                         /*
4399                          * Common processing of creation/deletion traps.
4400                          * First check if the instance is being
4401                          * [de]initialized; back off then, without doing
4402                          * anything more, since we are not sure if the
4403                          * async thread is around, or whether we might
4404                          * be racing with the detach code in ibd_m_stop()
4405                          * that scans the mcg list.
4406                          */
4407                         if (!ibd_async_safe(state))
4408                                 return;
4409 
4410                         req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4411                         req->rq_gid = event->sm_notice_gid;
4412                         req->rq_ptr = (void *)code;
4413                         ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4414                         break;
4415         }
4416 }
4417 
4418 static void
4419 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4420 {
4421         ib_gid_t mgid = req->rq_gid;
4422         ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4423         int ret;
4424         ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff;
4425 
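        /*
         * Per the IPoIB MGID layout (RFC 4391), bytes 4-5 of the GID prefix
         * carry the P_Key, hence the (gid_prefix >> 16) & 0xffff extraction
         * above; the gid_guid comparison against IB_MGID_IPV4_LOWGRP_MASK
         * further below is what recognizes the IPoIB broadcast group itself.
         */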
4426         DPRINT(10, "ibd_async_trap : %d\n", code);
4427 
4428         /*
4429          * Check if we have already joined the IPoIB broadcast group for our
4430          * PKEY. If joined, perform the rest of the operation.
4431          * Else, the interface is not initialised. Do the initialisation here
4432          * by calling ibd_start() and return.
4433          */
4434 
4435         if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4436             IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) &&
4437             (code == IBT_SM_EVENT_MCG_CREATED)) {
4438                 /*
4439                  * If we are in late HCA init and a notification for the
4440                  * creation of a MCG came in, check if it is the IPoIB MCG for
4441                  * this pkey. If not, return.
4442                  */
4443                 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey !=
4444                     state->id_pkey)) {
4445                         ibd_async_done(state);
4446                         return;
4447                 }
4448                 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4449                 /*
4450                  * Check if there is still a necessity to start the interface.
4451                  * It is possible that the user attempted unplumb at just about
4452                  * the same time, and if unplumb succeeded, we have nothing to
4453                  * do.
4454                  */
4455                 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4456                     IBD_DRV_IN_LATE_HCA_INIT) &&
4457                     ((ret = ibd_start(state)) != 0)) {
4458                         DPRINT(10, "ibd_async_trap: cannot start from late HCA "
4459                             "init, ret=%d", ret);
4460                 }
4461                 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4462                 ibd_async_done(state);
4463                 return;
4464         }
4465 
4466         /*
4467          * Atomically search the nonmember and sendonlymember lists and
4468          * delete.
4469          */
4470         ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4471 
4472         if (state->id_prom_op == IBD_OP_COMPLETED) {
4473                 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4474 
4475                 /*
4476                  * If in promiscuous mode, try to join/attach to the new
4477                  * mcg. Given the unreliable out-of-order mode of trap
4478                  * delivery, we can never be sure whether it is a problem
4479                  * if the join fails. Thus, we warn the admin of a failure
4480                  * if this was a creation trap. Note that the trap might
4481                  * actually be reporting a long past event, and the mcg
4482                  * might already have been deleted, thus we might be warning
4483                  * in vain.
4484                  */
4485                 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4486                     NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4487                         ibd_print_warn(state, "IBA promiscuous mode missed "
4488                             "new multicast gid %016llx:%016llx",
4489                             (u_longlong_t)mgid.gid_prefix,
4490                             (u_longlong_t)mgid.gid_guid);
4491         }
4492 
4493         /*
4494          * Free the request slot allocated by the subnet event thread.
4495          */
4496         ibd_async_done(state);
4497 }
4498 
4499 /*
4500  * GLDv3 entry point to get capabilities.
4501  */
4502 static boolean_t
4503 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4504 {
4505         ibd_state_t *state = arg;
4506 
4507         if (state->id_type == IBD_PORT_DRIVER)
4508                 return (B_FALSE);
4509 
4510         switch (cap) {
4511         case MAC_CAPAB_HCKSUM: {
4512                 uint32_t *txflags = cap_data;
4513 
4514                 /*
4515                  * We either do full checksum or not do it at all
4516                  */
4517                 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4518                         *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4519                 else
4520                         return (B_FALSE);
4521                 break;
4522         }
4523 
4524         case MAC_CAPAB_LSO: {
4525                 mac_capab_lso_t *cap_lso = cap_data;
4526 
4527                 /*
4528                  * In addition to the capability and policy, since LSO
4529                  * relies on hw checksum, we'll not enable LSO if we
4530                  * don't have hw checksum.  Of course, if the HCA doesn't
4531                  * provide the reserved lkey capability, enabling LSO will
4532                  * actually affect performance adversely, so we'll disable
4533                  * LSO even for that case.
4534                  */
4535                 if (!state->id_lso_policy || !state->id_lso_capable)
4536                         return (B_FALSE);
4537 
4538                 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4539                         return (B_FALSE);
4540 
4541                 if (state->id_hca_res_lkey_capab == 0) {
4542                         ibd_print_warn(state, "no reserved-lkey capability, "
4543                             "disabling LSO");
4544                         return (B_FALSE);
4545                 }
4546 
4547                 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4548                 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4549                 break;
4550         }
4551 
4552         default:
4553                 return (B_FALSE);
4554         }
4555 
4556         return (B_TRUE);
4557 }
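/*
 * In effect, ibd only offers full (end-to-end) checksum offload for IPv4
 * and basic TCP/IPv4 LSO, and only when the HCA checksum capability, the
 * reserved-lkey support and the configured LSO policy all allow it; there
 * is no partial-checksum mode.
 */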
4558 
4559 /*
4560  * GLDv3 callback functions to set and get link properties
4561  */
4562 static int
4563 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4564     uint_t pr_valsize, const void *pr_val)
4565 {
4566         ibd_state_t *state = arg;
4567         int err = 0;
4568         uint32_t link_mode;
4569 
4570         /* Cannot set properties on a port driver */
4571         if (state->id_type == IBD_PORT_DRIVER) {
4572                 return (ENOTSUP);
4573         }
4574 
4575         switch (pr_num) {
4576                 case MAC_PROP_IB_LINKMODE:
4577                         if (state->id_mac_state & IBD_DRV_STARTED) {
4578                                 err = EBUSY;
4579                                 break;
4580                         }
4581                         if (pr_val == NULL) {
4582                                 err = EINVAL;
4583                                 break;
4584                         }
4585                         bcopy(pr_val, &link_mode, sizeof (link_mode));
4586                         if (link_mode != IBD_LINK_MODE_UD &&
4587                             link_mode != IBD_LINK_MODE_RC) {
4588                                 err = EINVAL;
4589                         } else {
4590                                 if (link_mode == IBD_LINK_MODE_RC) {
4591                                         if (state->id_enable_rc) {
4592                                                 return (0);
4593                                         }
4594                                         state->id_enable_rc = 1;
4595                                         /* inform MAC framework of new MTU */
4596                                         err = mac_maxsdu_update2(state->id_mh,
4597                                             state->rc_mtu - IPOIB_HDRSIZE,
4598                                             state->id_mtu - IPOIB_HDRSIZE);
4599                                 } else {
4600                                         if (!state->id_enable_rc) {
4601                                                 return (0);
4602                                         }
4603                                         state->id_enable_rc = 0;
4604                                         err = mac_maxsdu_update2(state->id_mh,
4605                                             state->id_mtu - IPOIB_HDRSIZE,
4606                                             state->id_mtu - IPOIB_HDRSIZE);
4607                                 }
4608                                 (void) ibd_record_capab(state);
4609                                 mac_capab_update(state->id_mh);
4610                         }
4611                         break;
4612                 case MAC_PROP_PRIVATE:
4613                         err = ibd_set_priv_prop(state, pr_name,
4614                             pr_valsize, pr_val);
4615                         break;
4616                 default:
4617                         err = ENOTSUP;
4618                         break;
4619         }
4620         return (err);
4621 }
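/*
 * The link mode property above is what dladm manipulates when switching an
 * IPoIB datalink between UD and connected (RC) mode; on the releases we are
 * aware of it is exposed as the "linkmode" property, e.g. (illustrative,
 * link name hypothetical):
 *
 *	# dladm set-linkprop -p linkmode=cm ibd0	(RC / connected mode)
 *	# dladm set-linkprop -p linkmode=ud ibd0	(datagram mode)
 *
 * Note that the switch is refused with EBUSY while the interface is started.
 */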
4622 
4623 static int
4624 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4625     uint_t pr_valsize, void *pr_val)
4626 {
4627         ibd_state_t *state = arg;
4628         int err = 0;
4629 
4630         switch (pr_num) {
4631                 case MAC_PROP_MTU:
4632                         break;
4633                 default:
4634                         if (state->id_type == IBD_PORT_DRIVER) {
4635                                 return (ENOTSUP);
4636                         }
4637                         break;
4638         }
4639 
4640         switch (pr_num) {
4641                 case MAC_PROP_IB_LINKMODE:
4642                         *(uint_t *)pr_val = state->id_enable_rc;
4643                         break;
4644                 case MAC_PROP_PRIVATE:
4645                         err = ibd_get_priv_prop(state, pr_name, pr_valsize,
4646                             pr_val);
4647                         break;
4648                 default:
4649                         err = ENOTSUP;
4650                         break;
4651         }
4652         return (err);
4653 }
4654 
4655 static void
4656 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4657     mac_prop_info_handle_t prh)
4658 {
4659         ibd_state_t *state = arg;
4660 
4661         switch (pr_num) {
4662         case MAC_PROP_IB_LINKMODE: {
4663                 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE);
4664                 break;
4665         }
4666         case MAC_PROP_MTU: {
4667                 uint32_t min, max;
4668                 if (state->id_type == IBD_PORT_DRIVER) {
4669                         min = 1500;
4670                         max = IBD_DEF_RC_MAX_SDU;
4671                 } else if (state->id_enable_rc) {
4672                         min = max = IBD_DEF_RC_MAX_SDU;
4673                 } else {
4674                         min = max = state->id_mtu - IPOIB_HDRSIZE;
4675                 }
4676                 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4677                 mac_prop_info_set_range_uint32(prh, min, max);
4678                 break;
4679         }
4680         case MAC_PROP_PRIVATE: {
4681                 char valstr[64];
4682                 int value;
4683 
4684                 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4685                         mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4686                         return;
4687                 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4688                         value = IBD_DEF_COALESCE_COMPLETIONS;
4689                 } else if (strcmp(pr_name,
4690                     "_ibd_create_broadcast_group") == 0) {
4691                         value = IBD_DEF_CREATE_BCAST_GROUP;
4692                 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4693                         value = IBD_DEF_HASH_SIZE;
4694                 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4695                         value = IBD_DEF_LSO_POLICY;
4696                 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4697                         value = IBD_DEF_NUM_AH;
4698                 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4699                         value = IBD_DEF_NUM_LSO_BUFS;
4700                 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4701                         value = IBD_DEF_RC_ENABLE_SRQ;
4702                 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4703                         value = IBD_DEF_RC_NUM_RWQE;
4704                 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4705                         value = IBD_DEF_RC_NUM_SRQ;
4706                 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4707                         value = IBD_DEF_RC_NUM_SWQE;
4708                 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4709                         value = IBD_DEF_RC_RX_COMP_COUNT;
4710                 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4711                         value = IBD_DEF_RC_RX_COMP_USEC;
4712                 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4713                         value = IBD_DEF_RC_RX_COPY_THRESH;
4714                 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4715                         value = IBD_DEF_RC_RX_RWQE_THRESH;
4716                 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4717                         value = IBD_DEF_RC_TX_COMP_COUNT;
4718                 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4719                         value = IBD_DEF_RC_TX_COMP_USEC;
4720                 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4721                         value = IBD_DEF_RC_TX_COPY_THRESH;
4722                 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4723                         value = IBD_DEF_UD_NUM_RWQE;
4724                 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4725                         value = IBD_DEF_UD_NUM_SWQE;
4726                 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4727                         value = IBD_DEF_UD_RX_COMP_COUNT;
4728                 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4729                         value = IBD_DEF_UD_RX_COMP_USEC;
4730                 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4731                         value = IBD_DEF_UD_TX_COMP_COUNT;
4732                 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4733                         value = IBD_DEF_UD_TX_COMP_USEC;
4734                 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4735                         value = IBD_DEF_UD_TX_COPY_THRESH;
4736                 } else {
4737                         return;
4738                 }
4739 
4740                 (void) snprintf(valstr, sizeof (valstr), "%d", value);
4741                 mac_prop_info_set_default_str(prh, valstr);
4742                 break;
4743         }
4744         } /* switch (pr_num) */
4745 }
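/*
 * The private ("_ibd_*") property defaults above are reported to the MAC
 * framework as strings, so they should appear (as numbers formatted into
 * strings) in dladm show-linkprop output for the datalink.
 */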
4746 
4747 /* ARGSUSED2 */
4748 static int
4749 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name,
4750     uint_t pr_valsize, const void *pr_val)
4751 {
4752         int err = 0;
4753         long result;
4754 
4755         if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4756                 if (pr_val == NULL) {
4757                         return (EINVAL);
4758                 }
4759                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4760                 if (result < 0 || result > 1) {
4761                         err = EINVAL;
4762                 } else {
4763                         state->id_allow_coalesce_comp_tuning = (result == 1) ?
4764                             B_TRUE: B_FALSE;
4765                 }
4766                 return (err);
4767         }
4768         if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4769                 if (state->id_mac_state & IBD_DRV_STARTED) {
4770                         return (EBUSY);
4771                 }
4772                 if (pr_val == NULL) {
4773                         return (EINVAL);
4774                 }
4775                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4776                 if (result < 0 || result > 1) {
4777                         err = EINVAL;
4778                 } else {
4779                         state->id_create_broadcast_group = (result == 1) ?
4780                             B_TRUE: B_FALSE;
4781                 }
4782                 return (err);
4783         }
4784         if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4785                 if (state->id_mac_state & IBD_DRV_STARTED) {
4786                         return (EBUSY);
4787                 }
4788                 if (pr_val == NULL) {
4789                         return (EINVAL);
4790                 }
4791                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4792                 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) {
4793                         err = EINVAL;
4794                 } else {
4795                         state->id_hash_size = (uint32_t)result;
4796                 }
4797                 return (err);
4798         }
4799         if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4800                 if (state->id_mac_state & IBD_DRV_STARTED) {
4801                         return (EBUSY);
4802                 }
4803                 if (pr_val == NULL) {
4804                         return (EINVAL);
4805                 }
4806                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4807                 if (result < 0 || result > 1) {
4808                         err = EINVAL;
4809                 } else {
4810                         state->id_lso_policy = (result == 1) ?
4811                             B_TRUE: B_FALSE;
4812                 }
4813                 mac_capab_update(state->id_mh);
4814                 return (err);
4815         }
4816         if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4817                 if (state->id_mac_state & IBD_DRV_STARTED) {
4818                         return (EBUSY);
4819                 }
4820                 if (pr_val == NULL) {
4821                         return (EINVAL);
4822                 }
4823                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4824                 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) {
4825                         err = EINVAL;
4826                 } else {
4827                         state->id_num_ah = (uint32_t)result;
4828                 }
4829                 return (err);
4830         }
4831         if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4832                 if (state->id_mac_state & IBD_DRV_STARTED) {
4833                         return (EBUSY);
4834                 }
4835                 if (!state->id_lso_policy || !state->id_lso_capable) {
4836                         return (EINVAL);
4837                 }
4838                 if (pr_val == NULL) {
4839                         return (EINVAL);
4840                 }
4841                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4842                 if (result < IBD_MIN_NUM_LSO_BUFS ||
4843                     result > IBD_MAX_NUM_LSO_BUFS) {
4844                         err = EINVAL;
4845                 } else {
4846                         state->id_num_lso_bufs = (uint32_t)result;
4847                 }
4848                 return (err);
4849         }
4850         if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4851                 if (state->id_mac_state & IBD_DRV_STARTED) {
4852                         return (EBUSY);
4853                 }
4854                 if (pr_val == NULL) {
4855                         return (EINVAL);
4856                 }
4857                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4858                 if (result < 0 || result > 1) {
4859                         err = EINVAL;
4860                 } else {
4861                         state->rc_enable_srq = (result == 1) ?
4862                             B_TRUE: B_FALSE;
4863                 }
4864                 if (!state->rc_enable_srq) {
4865                         state->id_rc_num_srq = 0;
4866                 }
4867                 return (err);
4868         }
4869         if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4870                 if (state->id_mac_state & IBD_DRV_STARTED) {
4871                         return (EBUSY);
4872                 }
4873                 if (pr_val == NULL) {
4874                         return (EINVAL);
4875                 }
4876                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4877                 if (result < IBD_MIN_RC_NUM_RWQE ||
4878                     result > IBD_MAX_RC_NUM_RWQE) {
4879                         err = EINVAL;
4880                 } else {
4881                         state->id_rc_num_rwqe = (uint32_t)result;
4882                         if (state->id_allow_coalesce_comp_tuning &&
4883                             state->id_rc_rx_comp_count > state->id_rc_num_rwqe)
4884                                 state->id_rc_rx_comp_count =
4885                                     state->id_rc_num_rwqe;
4886                         if (state->id_rc_num_srq > state->id_rc_num_rwqe)
4887                                 state->id_rc_num_srq =
4888                                     state->id_rc_num_rwqe - 1;
4889                         /*
4890                          * If rx_rwqe_threshold is greater than the number of
4891                          * rwqes, pull it back to 25% of number of rwqes.
4892                          */
4893                         if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe)
4894                                 state->id_rc_rx_rwqe_thresh =
4895                                     (state->id_rc_num_rwqe >> 2);
4896 
4897                 }
4898                 return (err);
4899         }
4900         if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4901                 if (state->id_mac_state & IBD_DRV_STARTED) {
4902                         return (EBUSY);
4903                 }
4904                 if (pr_val == NULL) {
4905                         return (EINVAL);
4906                 }
4907                 if (!state->rc_enable_srq)
4908                         return (EINVAL);
4909 
4910                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4911                 if (result < IBD_MIN_RC_NUM_SRQ ||
4912                     result >= state->id_rc_num_rwqe) {
4913                         err = EINVAL;
4914                 } else
4915                         state->id_rc_num_srq = (uint32_t)result;
4916                 return (err);
4917         }
4918         if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4919                 if (state->id_mac_state & IBD_DRV_STARTED) {
4920                         return (EBUSY);
4921                 }
4922                 if (pr_val == NULL) {
4923                         return (EINVAL);
4924                 }
4925                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4926                 if (result < IBD_MIN_RC_NUM_SWQE ||
4927                     result > IBD_MAX_RC_NUM_SWQE) {
4928                         err = EINVAL;
4929                 } else {
4930                         state->id_rc_num_swqe = (uint32_t)result;
4931                         if (state->id_allow_coalesce_comp_tuning &&
4932                             state->id_rc_tx_comp_count > state->id_rc_num_swqe)
4933                                 state->id_rc_tx_comp_count =
4934                                     state->id_rc_num_swqe;
4935                 }
4936                 return (err);
4937         }
4938         if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4939                 if (!state->id_allow_coalesce_comp_tuning) {
4940                         return (ENOTSUP);
4941                 }
4942                 if (pr_val == NULL) {
4943                         return (EINVAL);
4944                 }
4945                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4946                 if (result < 1 || result > state->id_rc_num_rwqe) {
4947                         err = EINVAL;
4948                 } else {
4949                         state->id_rc_rx_comp_count = (uint32_t)result;
4950                 }
4951                 return (err);
4952         }
4953         if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4954                 if (!state->id_allow_coalesce_comp_tuning) {
4955                         return (ENOTSUP);
4956                 }
4957                 if (pr_val == NULL) {
4958                         return (EINVAL);
4959                 }
4960                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4961                 if (result < 1) {
4962                         err = EINVAL;
4963                 } else {
4964                         state->id_rc_rx_comp_usec = (uint32_t)result;
4965                 }
4966                 return (err);
4967         }
4968         if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4969                 if (state->id_mac_state & IBD_DRV_STARTED) {
4970                         return (EBUSY);
4971                 }
4972                 if (pr_val == NULL) {
4973                         return (EINVAL);
4974                 }
4975                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4976                 if (result < IBD_MIN_RC_RX_COPY_THRESH ||
4977                     result > state->rc_mtu) {
4978                         err = EINVAL;
4979                 } else {
4980                         state->id_rc_rx_copy_thresh = (uint32_t)result;
4981                 }
4982                 return (err);
4983         }
4984         if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4985                 if (state->id_mac_state & IBD_DRV_STARTED) {
4986                         return (EBUSY);
4987                 }
4988                 if (pr_val == NULL) {
4989                         return (EINVAL);
4990                 }
4991                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4992                 if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
4993                     result >= state->id_rc_num_rwqe) {
4994                         err = EINVAL;
4995                 } else {
4996                         state->id_rc_rx_rwqe_thresh = (uint32_t)result;
4997                 }
4998                 return (err);
4999         }
5000         if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5001                 if (!state->id_allow_coalesce_comp_tuning) {
5002                         return (ENOTSUP);
5003                 }
5004                 if (pr_val == NULL) {
5005                         return (EINVAL);
5006                 }
5007                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5008                 if (result < 1 || result > state->id_rc_num_swqe) {
5009                         err = EINVAL;
5010                 } else {
5011                         state->id_rc_tx_comp_count = (uint32_t)result;
5012                 }
5013                 return (err);
5014         }
5015         if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5016                 if (!state->id_allow_coalesce_comp_tuning) {
5017                         return (ENOTSUP);
5018                 }
5019                 if (pr_val == NULL) {
5020                         return (EINVAL);
5021                 }
5022                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5023                 if (result < 1)
5024                         err = EINVAL;
5025                 else {
5026                         state->id_rc_tx_comp_usec = (uint32_t)result;
5027                 }
5028                 return (err);
5029         }
5030         if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5031                 if (state->id_mac_state & IBD_DRV_STARTED) {
5032                         return (EBUSY);
5033                 }
5034                 if (pr_val == NULL) {
5035                         return (EINVAL);
5036                 }
5037                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5038                 if (result < IBD_MIN_RC_TX_COPY_THRESH ||
5039                     result > state->rc_mtu) {
5040                         err = EINVAL;
5041                 } else {
5042                         state->id_rc_tx_copy_thresh = (uint32_t)result;
5043                 }
5044                 return (err);
5045         }
5046         if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5047                 if (state->id_mac_state & IBD_DRV_STARTED) {
5048                         return (EBUSY);
5049                 }
5050                 if (pr_val == NULL) {
5051                         return (EINVAL);
5052                 }
5053                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5054                 if (result < IBD_MIN_UD_NUM_RWQE ||
5055                     result > IBD_MAX_UD_NUM_RWQE) {
5056                         err = EINVAL;
5057                 } else {
5058                         if (result > state->id_hca_max_chan_sz) {
5059                                 state->id_ud_num_rwqe =
5060                                     state->id_hca_max_chan_sz;
5061                         } else {
5062                                 state->id_ud_num_rwqe = (uint32_t)result;
5063                         }
5064                         if (state->id_allow_coalesce_comp_tuning &&
5065                             state->id_ud_rx_comp_count > state->id_ud_num_rwqe)
5066                                 state->id_ud_rx_comp_count =
5067                                     state->id_ud_num_rwqe;
5068                 }
5069                 return (err);
5070         }
5071         if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5072                 if (state->id_mac_state & IBD_DRV_STARTED) {
5073                         return (EBUSY);
5074                 }
5075                 if (pr_val == NULL) {
5076                         return (EINVAL);
5077                 }
5078                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5079                 if (result < IBD_MIN_UD_NUM_SWQE ||
5080                     result > IBD_MAX_UD_NUM_SWQE) {
5081                         err = EINVAL;
5082                 } else {
5083                         if (result > state->id_hca_max_chan_sz) {
5084                                 state->id_ud_num_swqe =
5085                                     state->id_hca_max_chan_sz;
5086                         } else {
5087                                 state->id_ud_num_swqe = (uint32_t)result;
5088                         }
5089                         if (state->id_allow_coalesce_comp_tuning &&
5090                             state->id_ud_tx_comp_count > state->id_ud_num_swqe)
5091                                 state->id_ud_tx_comp_count =
5092                                     state->id_ud_num_swqe;
5093                 }
5094                 return (err);
5095         }
5096         if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5097                 if (!state->id_allow_coalesce_comp_tuning) {
5098                         return (ENOTSUP);
5099                 }
5100                 if (pr_val == NULL) {
5101                         return (EINVAL);
5102                 }
5103                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5104                 if (result < 1 || result > state->id_ud_num_rwqe) {
5105                         err = EINVAL;
5106                 } else {
5107                         state->id_ud_rx_comp_count = (uint32_t)result;
5108                 }
5109                 return (err);
5110         }
5111         if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5112                 if (!state->id_allow_coalesce_comp_tuning) {
5113                         return (ENOTSUP);
5114                 }
5115                 if (pr_val == NULL) {
5116                         return (EINVAL);
5117                 }
5118                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5119                 if (result < 1) {
5120                         err = EINVAL;
5121                 } else {
5122                         state->id_ud_rx_comp_usec = (uint32_t)result;
5123                 }
5124                 return (err);
5125         }
5126         if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5127                 if (!state->id_allow_coalesce_comp_tuning) {
5128                         return (ENOTSUP);
5129                 }
5130                 if (pr_val == NULL) {
5131                         return (EINVAL);
5132                 }
5133                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5134                 if (result < 1 || result > state->id_ud_num_swqe) {
5135                         err = EINVAL;
5136                 } else {
5137                         state->id_ud_tx_comp_count = (uint32_t)result;
5138                 }
5139                 return (err);
5140         }
5141         if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5142                 if (!state->id_allow_coalesce_comp_tuning) {
5143                         return (ENOTSUP);
5144                 }
5145                 if (pr_val == NULL) {
5146                         return (EINVAL);
5147                 }
5148                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5149                 if (result < 1) {
5150                         err = EINVAL;
5151                 } else {
5152                         state->id_ud_tx_comp_usec = (uint32_t)result;
5153                 }
5154                 return (err);
5155         }
5156         if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5157                 if (state->id_mac_state & IBD_DRV_STARTED) {
5158                         return (EBUSY);
5159                 }
5160                 if (pr_val == NULL) {
5161                         return (EINVAL);
5162                 }
5163                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5164                 if (result < IBD_MIN_UD_TX_COPY_THRESH ||
5165                     result > IBD_MAX_UD_TX_COPY_THRESH) {
5166                         err = EINVAL;
5167                 } else {
5168                         state->id_ud_tx_copy_thresh = (uint32_t)result;
5169                 }
5170                 return (err);
5171         }
5172         return (ENOTSUP);
5173 }
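/*
 * Illustrative use of the private properties handled above (property names
 * are the "_ibd_*" strings matched in ibd_set_priv_prop(); the link name is
 * hypothetical):
 *
 *	# dladm set-linkprop -p _ibd_lso_enable=0 ibd0
 *	# dladm set-linkprop -p _ibd_coalesce_completions=1 ibd0
 *
 * Most of these return EBUSY unless the datalink is stopped/unplumbed, and
 * the completion-coalescing tunables additionally require
 * _ibd_coalesce_completions to be enabled first.
 */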
5174 
5175 static int
5176 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize,
5177     void *pr_val)
5178 {
5179         int err = ENOTSUP;
5180         int value;
5181 
5182         if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
5183                 value = state->id_bgroup_present;
5184                 err = 0;
5185                 goto done;
5186         }
5187         if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
5188                 value = state->id_allow_coalesce_comp_tuning;
5189                 err = 0;
5190                 goto done;
5191         }
5192         if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
5193                 value = state->id_create_broadcast_group;
5194                 err = 0;
5195                 goto done;
5196         }
5197         if (strcmp(pr_name, "_ibd_hash_size") == 0) {
5198                 value = state->id_hash_size;
5199                 err = 0;
5200                 goto done;
5201         }
5202         if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
5203                 value = state->id_lso_policy;
5204                 err = 0;
5205                 goto done;
5206         }
5207         if (strcmp(pr_name, "_ibd_num_ah") == 0) {
5208                 value = state->id_num_ah;
5209                 err = 0;
5210                 goto done;
5211         }
5212         if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
5213                 value = state->id_num_lso_bufs;
5214                 err = 0;
5215                 goto done;
5216         }
5217         if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
5218                 value = state->rc_enable_srq;
5219                 err = 0;
5220                 goto done;
5221         }
5222         if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
5223                 value = state->id_rc_num_rwqe;
5224                 err = 0;
5225                 goto done;
5226         }
5227         if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
5228                 value = state->id_rc_num_srq;
5229                 err = 0;
5230                 goto done;
5231         }
5232         if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
5233                 value = state->id_rc_num_swqe;
5234                 err = 0;
5235                 goto done;
5236         }
5237         if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
5238                 value = state->id_rc_rx_comp_count;
5239                 err = 0;
5240                 goto done;
5241         }
5242         if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
5243                 value = state->id_rc_rx_comp_usec;
5244                 err = 0;
5245                 goto done;
5246         }
5247         if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
5248                 value = state->id_rc_rx_copy_thresh;
5249                 err = 0;
5250                 goto done;
5251         }
5252         if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
5253                 value = state->id_rc_rx_rwqe_thresh;
5254                 err = 0;
5255                 goto done;
5256         }
5257         if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5258                 value = state->id_rc_tx_comp_count;
5259                 err = 0;
5260                 goto done;
5261         }
5262         if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5263                 value = state->id_rc_tx_comp_usec;
5264                 err = 0;
5265                 goto done;
5266         }
5267         if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5268                 value = state->id_rc_tx_copy_thresh;
5269                 err = 0;
5270                 goto done;
5271         }
5272         if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5273                 value = state->id_ud_num_rwqe;
5274                 err = 0;
5275                 goto done;
5276         }
5277         if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5278                 value = state->id_ud_num_swqe;
5279                 err = 0;
5280                 goto done;
5281         }
5282         if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5283                 value = state->id_ud_rx_comp_count;
5284                 err = 0;
5285                 goto done;
5286         }
5287         if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5288                 value = state->id_ud_rx_comp_usec;
5289                 err = 0;
5290                 goto done;
5291         }
5292         if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5293                 value = state->id_ud_tx_comp_count;
5294                 err = 0;
5295                 goto done;
5296         }
5297         if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5298                 value = state->id_ud_tx_comp_usec;
5299                 err = 0;
5300                 goto done;
5301         }
5302         if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5303                 value = state->id_ud_tx_copy_thresh;
5304                 err = 0;
5305                 goto done;
5306         }
5307 done:
5308         if (err == 0) {
5309                 (void) snprintf(pr_val, pr_valsize, "%d", value);
5310         }
5311         return (err);
5312 }
5313 
5314 static int
5315 ibd_get_port_details(ibd_state_t *state)
5316 {
5317         ibt_hca_portinfo_t *port_infop;
5318         ibt_status_t ret;
5319         uint_t psize, port_infosz;
5320 
5321         mutex_enter(&state->id_link_mutex);
5322 
5323         /*
5324          * Query for port information
5325          */
5326         ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
5327             &port_infop, &psize, &port_infosz);
5328         if ((ret != IBT_SUCCESS) || (psize != 1)) {
5329                 mutex_exit(&state->id_link_mutex);
5330                 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
5331                     "failed, ret=%d", ret);
5332                 return (ENETDOWN);
5333         }
5334 
5335         /*
5336          * If the link is active, verify the pkey
5337          */
5338         if (port_infop->p_linkstate == IBT_PORT_ACTIVE) {
5339                 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
5340                     state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
5341                         state->id_link_state = LINK_STATE_DOWN;
5342                 } else {
5343                         state->id_link_state = LINK_STATE_UP;
5344                 }
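                /*
                 * p_mtu is the IB-encoded MTU (1 = 256 bytes through
                 * 5 = 4096 bytes), so shifting 128 left by the code yields
                 * the link MTU in bytes.
                 */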
5345                 state->id_mtu = (128 << port_infop->p_mtu);
5346                 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
5347                 state->id_sgid = *port_infop->p_sgid_tbl;
5348                 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
5349                 /*
5350                  * Now that the port is active, record the port speed
5351                  */
5352                 state->id_link_speed = ibd_get_portspeed(state);
5353         } else {
5354                 /* Make sure that these are handled in PORT_UP/CHANGE */
5355                 state->id_mtu = 0;
5356                 state->id_link_state = LINK_STATE_DOWN;
5357                 state->id_link_speed = 0;
5358         }
5359         mutex_exit(&state->id_link_mutex);
5360         ibt_free_portinfo(port_infop, port_infosz);
5361 
5362         return (0);
5363 }
5364 
5365 static int
5366 ibd_alloc_cqs(ibd_state_t *state)
5367 {
5368         ibt_hca_attr_t hca_attrs;
5369         ibt_cq_attr_t cq_attr;
5370         ibt_status_t ret;
5371         uint32_t real_size;
5372         uint_t num_rwqe_change = 0;
5373         uint_t num_swqe_change = 0;
5374 
5375         ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
5376         ASSERT(ret == IBT_SUCCESS);
5377 
5378         /*
5379          * Size each CQ relative to its work queue: in theory there is no
5380          * point in having more cqe's than wqe's, but the CQ will be
5381          * signaled for overflow if the last wqe completes while none of
5382          * the earlier cqe's have been polled. We therefore keep each work
5383          * queue one entry smaller than its CQ (clamping the work queue
5384          * when the HCA's maximum CQ size is the limit) to prevent this.
5385          */
5386         cq_attr.cq_sched = NULL;
5387         cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
5388 
5389         /*
5390          * Allocate Receive CQ.
5391          */
5392         if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) {
5393                 cq_attr.cq_size = state->id_ud_num_rwqe + 1;
5394         } else {
5395                 cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5396                 num_rwqe_change = state->id_ud_num_rwqe;
5397                 state->id_ud_num_rwqe = cq_attr.cq_size - 1;
5398         }
5399 
5400         if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5401             &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
5402                 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
5403                     "failed, ret=%d\n", ret);
5404                 return (DDI_FAILURE);
5405         }
5406 
5407         if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count,
5408             state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) {
5409                 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
5410                     "moderation failed, ret=%d\n", ret);
5411         }
5412 
5413         /* make the #rx wc's the same as max rx chain size */
5414         state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
5415         state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
5416             state->id_rxwcs_size, KM_SLEEP);
5417 
5418         /*
5419          * Allocate Send CQ.
5420          */
5421         if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) {
5422                 cq_attr.cq_size = state->id_ud_num_swqe + 1;
5423         } else {
5424                 cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5425                 num_swqe_change = state->id_ud_num_swqe;
5426                 state->id_ud_num_swqe = cq_attr.cq_size - 1;
5427         }
5428 
5429         if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5430             &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
5431                 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
5432                     "failed, ret=%d\n", ret);
5433                 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
5434                     state->id_rxwcs_size);
5435                 (void) ibt_free_cq(state->id_rcq_hdl);
5436                 return (DDI_FAILURE);
5437         }
5438         if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count,
5439             state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) {
5440                 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
5441                     "moderation failed, ret=%d\n", ret);
5442         }
5443 
5444         state->id_txwcs_size = IBD_TX_POLL_THRESH;
5445         state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
5446             state->id_txwcs_size, KM_SLEEP);
5447 
5448         /*
5449          * Print message in case we could not allocate as many wqe's
5450          * as was requested.
5451          */
5452         if (num_rwqe_change) {
5453                 ibd_print_warn(state, "Setting #rwqe = %d instead of default "
5454                     "%d", state->id_ud_num_rwqe, num_rwqe_change);
5455         }
5456         if (num_swqe_change) {
5457                 ibd_print_warn(state, "Setting #swqe = %d instead of default "
5458                     "%d", state->id_ud_num_swqe, num_swqe_change);
5459         }
5460 
5461         return (DDI_SUCCESS);
5462 }
5463 
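     /*
      * Allocate the UD channel used for IPoIB traffic, binding it to our
      * port, pkey index, the broadcast group's qkey and the send/receive
      * CQs, and record the resulting QP number.
      */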
5464 static int
5465 ibd_setup_ud_channel(ibd_state_t *state)
5466 {
5467         ibt_ud_chan_alloc_args_t ud_alloc_attr;
5468         ibt_ud_chan_query_attr_t ud_chan_attr;
5469         ibt_status_t ret;
5470 
5471         ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
5472         if (state->id_hca_res_lkey_capab)
5473                 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
5474         if (state->id_lso_policy && state->id_lso_capable)
5475                 ud_alloc_attr.ud_flags |= IBT_USES_LSO;
5476 
5477         ud_alloc_attr.ud_hca_port_num   = state->id_port;
5478         ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
5479         ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
5480         ud_alloc_attr.ud_sizes.cs_sq    = state->id_ud_num_swqe;
5481         ud_alloc_attr.ud_sizes.cs_rq    = state->id_ud_num_rwqe;
5482         ud_alloc_attr.ud_qkey           = state->id_mcinfo->mc_qkey;
5483         ud_alloc_attr.ud_scq            = state->id_scq_hdl;
5484         ud_alloc_attr.ud_rcq            = state->id_rcq_hdl;
5485         ud_alloc_attr.ud_pd             = state->id_pd_hdl;
5486         ud_alloc_attr.ud_pkey_ix        = state->id_pkix;
5487         ud_alloc_attr.ud_clone_chan     = NULL;
5488 
5489         if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
5490             &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
5491                 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
5492                     "failed, ret=%d\n", ret);
5493                 return (DDI_FAILURE);
5494         }
5495 
5496         if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
5497             &ud_chan_attr)) != IBT_SUCCESS) {
5498                 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
5499                     "failed, ret=%d\n", ret);
5500                 (void) ibt_free_channel(state->id_chnl_hdl);
5501                 return (DDI_FAILURE);
5502         }
5503 
5504         state->id_qpnum = ud_chan_attr.ud_qpn;
5505 
5506         return (DDI_SUCCESS);
5507 }
5508 
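     /*
      * Undo whatever ibd_start() managed to set up, using the progress
      * bits recorded in id_mac_state to decide which pieces need to be
      * torn down. The link state is marked down (or unknown) first so
      * that the IP layer stops using this instance for new transfers.
      */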
5509 static int
5510 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
5511 {
5512         uint32_t progress = state->id_mac_state;
5513         uint_t attempts;
5514         ibt_status_t ret;
5515         ib_gid_t mgid;
5516         ibd_mce_t *mce;
5517         uint8_t jstate;
5518         timeout_id_t tid;
5519 
5520         if (atomic_dec_32_nv(&state->id_running) != 0)
5521                 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
5522 
5523         /*
5524          * Before we try to stop/undo whatever we did in ibd_start(),
5525          * we need to mark the link state appropriately to prevent the
5526          * ip layer from using this instance for any new transfers. Note
5527          * that if the link was "up" when we get here, we set the final
5528          * link state to "unknown", to behave in the same fashion as
5529          * other ethernet drivers.
5530          */
5531         mutex_enter(&state->id_link_mutex);
5532         if (cur_link_state == LINK_STATE_DOWN) {
5533                 state->id_link_state = cur_link_state;
5534         } else {
5535                 state->id_link_state = LINK_STATE_UNKNOWN;
5536         }
5537         mutex_exit(&state->id_link_mutex);
5538         bzero(&state->id_macaddr, sizeof (ipoib_mac_t));
5539         mac_link_update(state->id_mh, state->id_link_state);
5540 
5541         state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
5542         if (progress & IBD_DRV_STARTED) {
5543                 state->id_mac_state &= (~IBD_DRV_STARTED);
5544         }
5545 
5546         if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
5547                 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT);
5548         }
5549 
5550         /* Stop listen under Reliable Connected Mode */
5551         if (progress & IBD_DRV_RC_LISTEN) {
5552                 ASSERT(state->id_enable_rc);
5553                 if (state->rc_listen_hdl != NULL) {
5554                         ibd_rc_stop_listen(state);
5555                 }
5556                 state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
5557         }
5558 
5559         /* Stop timeout routine */
5560         if (progress & IBD_DRV_RC_TIMEOUT) {
5561                 ASSERT(state->id_enable_rc);
5562                 mutex_enter(&state->rc_timeout_lock);
5563                 state->rc_timeout_start = B_FALSE;
5564                 tid = state->rc_timeout;
5565                 state->rc_timeout = 0;
5566                 mutex_exit(&state->rc_timeout_lock);
5567                 if (tid != 0)
5568                         (void) untimeout(tid);
5569                 state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT);
5570         }
5571 
5572         if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
5573                 attempts = 100;
5574                 while (state->id_ah_op == IBD_OP_ONGOING) {
5575                         /*
5576                          * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB
5577                          * port is connecting to a remote IPoIB port. Wait for
5578                          * the end of this connecting operation.
5579                          */
5580                         delay(drv_usectohz(100000));
5581                         if (--attempts == 0) {
5582                                 state->rc_stop_connect++;
5583                                 DPRINT(40, "ibd_undo_start: connecting");
5584                                 break;
5585                         }
5586                 }
5587                 mutex_enter(&state->id_sched_lock);
5588                 state->id_sched_needed = 0;
5589                 mutex_exit(&state->id_sched_lock);
5590                 (void) ibd_rc_close_all_chan(state);
5591         }
5592 
5593         /*
5594          * First, stop receive interrupts; this stops the driver from
5595          * handing up buffers to higher layers.  Wait for receive buffers
5596          * to be returned and give up after 1 second.
5597          */
5598         if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
5599                 attempts = 10;
5600                 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
5601                     0) > 0) {
5602                         delay(drv_usectohz(100000));
5603                         if (--attempts == 0) {
5604                                 /*
5605                                  * There are pending bufs with the network
5606                                  * layer and we have no choice but to stop
5607                                  * waiting for them to be returned; note
5608                                  * the fact via cmn_err/DPRINT and carry
5609                                  * on with the rest of the teardown of
5610                                  * this instance.
5611                                  */
5612                                 cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
5613                                 DPRINT(2, "ibd_undo_start: "
5614                                     "reclaiming failed");
5615                                 break;
5616                         }
5617                 }
5618                 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
5619         }
5620 
5621         if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
5622                 ibd_rc_fini_tx_largebuf_list(state);
5623                 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
5624         }
5625 
5626         if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
5627                 ASSERT(state->id_enable_rc);
5628                 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
5629                         if (state->id_ah_op == IBD_OP_ONGOING) {
5630                                 delay(drv_usectohz(10000));
5631                                 if (state->id_ah_op == IBD_OP_ONGOING) {
5632                                         /*
5633                                          * "state->id_ah_op == IBD_OP_ONGOING"
5634                                          * means this IPoIB port is connecting
5635                                          * to a remote IPoIB port. We can't
5636                                          * delete SRQ here.
5637                                          */
5638                                         state->rc_stop_connect++;
5639                                         DPRINT(40, "ibd_undo_start: "
5640                                             "connecting");
5641                                 } else {
5642                                         ibd_rc_fini_srq_list(state);
5643                                         state->id_mac_state &=
5644                                             (~IBD_DRV_RC_SRQ_ALLOCD);
5645                                 }
5646                         } else {
5647                                 ibd_rc_fini_srq_list(state);
5648                                 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
5649                         }
5650                 } else {
5651                         DPRINT(40, "ibd_undo_start: srq bufs outstanding\n");
5652                 }
5653         }
5654 
5655         if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
5656                 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
5657 
5658                 mutex_enter(&state->id_trap_lock);
5659                 state->id_trap_stop = B_TRUE;
5660                 while (state->id_trap_inprog > 0)
5661                         cv_wait(&state->id_trap_cv, &state->id_trap_lock);
5662                 mutex_exit(&state->id_trap_lock);
5663 
5664                 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
5665         }
5666 
5667         if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
5668                 /*
5669                  * Flushing the channel ensures that all pending WQE's
5670                  * are marked with flush_error and handed to the CQ. It
5671                  * does not guarantee the invocation of the CQ handler.
5672                  * This call is guaranteed to return successfully for
5673                  * UD QPNs.
5674                  */
5675                 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
5676                     IBT_SUCCESS) {
5677                         DPRINT(10, "ibd_undo_start: flush_channel "
5678                             "failed, ret=%d", ret);
5679                 }
5680 
5681                 /*
5682                  * Give some time for the TX CQ handler to process the
5683                  * completions.
5684                  */
5685                 attempts = 10;
5686                 mutex_enter(&state->id_tx_list.dl_mutex);
5687                 mutex_enter(&state->id_tx_rel_list.dl_mutex);
5688                 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
5689                     != state->id_ud_num_swqe) {
5690                         if (--attempts == 0)
5691                                 break;
5692                         mutex_exit(&state->id_tx_rel_list.dl_mutex);
5693                         mutex_exit(&state->id_tx_list.dl_mutex);
5694                         delay(drv_usectohz(100000));
5695                         mutex_enter(&state->id_tx_list.dl_mutex);
5696                         mutex_enter(&state->id_tx_rel_list.dl_mutex);
5697                 }
5698                 ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5699                 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
5700                     state->id_ud_num_swqe) {
5701                         cmn_err(CE_WARN, "tx resources not freed\n");
5702                 }
5703                 mutex_exit(&state->id_tx_rel_list.dl_mutex);
5704                 mutex_exit(&state->id_tx_list.dl_mutex);
5705 
5706                 attempts = 10;
5707                 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5708                         if (--attempts == 0)
5709                                 break;
5710                         delay(drv_usectohz(100000));
5711                 }
5712                 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
5713                 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5714                         cmn_err(CE_WARN, "rx resources not freed\n");
5715                 }
5716 
5717                 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
5718         }
5719 
5720         if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
5721                 /*
5722                  * Drop all residual full/non membership. This includes full
5723                  * membership to the broadcast group, and any nonmembership
5724                  * acquired during transmits. We do this after the Tx completion
5725                  * handlers are done, since those might result in some late
5726                  * leaves; this also eliminates a potential race with that
5727                  * path wrt the mc full list insert/delete. Trap handling
5728                  * has also been suppressed at this point. Thus, no locks
5729                  * are required while traversing the mc full list.
5730                  */
5731                 DPRINT(2, "ibd_undo_start: clear full cache entries");
5732                 mce = list_head(&state->id_mc_full);
5733                 while (mce != NULL) {
5734                         mgid = mce->mc_info.mc_adds_vect.av_dgid;
5735                         jstate = mce->mc_jstate;
5736                         mce = list_next(&state->id_mc_full, mce);
5737                         ibd_leave_group(state, mgid, jstate);
5738                 }
5739                 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5740         }
5741 
5742         if (progress & IBD_DRV_RXLIST_ALLOCD) {
5743                 ibd_fini_rxlist(state);
5744                 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5745         }
5746 
5747         if (progress & IBD_DRV_TXLIST_ALLOCD) {
5748                 ibd_fini_txlist(state);
5749                 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5750         }
5751 
5752         if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5753                 if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5754                     IBT_SUCCESS) {
5755                         DPRINT(10, "ibd_undo_start: free_channel "
5756                             "failed, ret=%d", ret);
5757                 }
5758 
5759                 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5760         }
5761 
5762         if (progress & IBD_DRV_CQS_ALLOCD) {
5763                 kmem_free(state->id_txwcs,
5764                     sizeof (ibt_wc_t) * state->id_txwcs_size);
5765                 if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5766                     IBT_SUCCESS) {
5767                         DPRINT(10, "ibd_undo_start: free_cq(scq) "
5768                             "failed, ret=%d", ret);
5769                 }
5770 
5771                 kmem_free(state->id_rxwcs,
5772                     sizeof (ibt_wc_t) * state->id_rxwcs_size);
5773                 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5774                         DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5775                             "ret=%d", ret);
5776                 }
5777 
5778                 state->id_txwcs = NULL;
5779                 state->id_rxwcs = NULL;
5780                 state->id_scq_hdl = NULL;
5781                 state->id_rcq_hdl = NULL;
5782 
5783                 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5784         }
5785 
5786         if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5787                 mutex_enter(&state->id_ac_mutex);
5788                 mod_hash_destroy_hash(state->id_ah_active_hash);
5789                 mutex_exit(&state->id_ac_mutex);
5790                 ibd_acache_fini(state);
5791 
5792                 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5793         }
5794 
5795         if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5796                 /*
5797                  * If we'd created the ipoib broadcast group and had
5798                  * successfully joined it, leave it now
5799                  */
5800                 if (state->id_bgroup_created) {
5801                         mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5802                         jstate = IB_MC_JSTATE_FULL;
5803                         (void) ibt_leave_mcg(state->id_sgid, mgid,
5804                             state->id_sgid, jstate);
5805                 }
5806                 ibt_free_mcg_info(state->id_mcinfo, 1);
5807 
5808                 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5809         }
5810 
5811         return (DDI_SUCCESS);
5812 }
5813 
5814 /*
5815  * This pair of routines is used to set/clear the condition that the
5816  * caller is about to do something to change the id_mac_state.
5817  * If there's already someone doing either a start or a stop (possibly
5818  * due to the async handler detecting a pkey relocation event, a plumb
5819  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5820  * that's done.
5821  */
5822 static void
5823 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5824 {
5825         mutex_enter(&state->id_macst_lock);
5826         while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5827                 cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5828 
5829         state->id_mac_state |= flag;
5830         mutex_exit(&state->id_macst_lock);
5831 }
5832 
5833 static void
5834 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5835 {
5836         mutex_enter(&state->id_macst_lock);
5837         state->id_mac_state &= (~flag);
5838         cv_signal(&state->id_macst_cv);
5839         mutex_exit(&state->id_macst_lock);
5840 }
5841 
5842 /*
5843  * GLDv3 entry point to start hardware.
5844  */
5845 /*ARGSUSED*/
5846 static int
5847 ibd_m_start(void *arg)
5848 {
5849         ibd_state_t *state = arg;
5850         int     ret;
5851 
5852         if (state->id_type == IBD_PORT_DRIVER)
5853                 return (EINVAL);
5854 
5855         ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5856         if (state->id_mac_state & IBD_DRV_IN_DELETION) {
5857                 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5858                 return (EIO);
5859         }
5860 
5861         ret = ibd_start(state);
5862         ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5863         return (ret);
5864 }
5865 
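     /*
      * Bring the interface up: obtain the port details, find and join the
      * IPoIB broadcast group, set up the CQs, the UD channel and the Tx/Rx
      * buffer lists, and report the real mac address and MTU to GLDv3.
      * If the port or pkey is not yet available, register for subnet
      * notices, enter the late HCA initialization state and return
      * success; the remaining setup is performed when the interface is
      * started again from the async handler.
      */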
5866 static int
5867 ibd_start(ibd_state_t *state)
5868 {
5869         int err;
5870         ibt_status_t ret;
5871         int late_hca_init = 0;
5872 
5873         if (state->id_mac_state & IBD_DRV_STARTED)
5874                 return (DDI_SUCCESS);
5875 
5876         /*
5877          * We do not increment the running flag when ibd_start() is called
5878          * as a result of an event that moves the state out of late HCA
5879          * initialization, viz. MCG_CREATED, PORT_CHANGE or link availability.
5880          */
5881         if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
5882             (atomic_inc_32_nv(&state->id_running) != 1)) {
5883                 DPRINT(10, "ibd_start: id_running is non-zero");
5884                 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
5885                 atomic_dec_32(&state->id_running);
5886                 return (EINVAL);
5887         }
5888 
5889         /*
5890          * Get port details; if we fail here, something bad happened.
5891          * Fail plumb.
5892          */
5893         if ((err = ibd_get_port_details(state)) != 0) {
5894                 DPRINT(10, "ibd_start: ibd_get_port_details() failed");
5895                 goto start_fail;
5896         }
5897         /*
5898          * If state->id_link_state is DOWN, it indicates that either the port
5899          * is down, or the pkey is not available. In both cases, resort to late
5900          * initialization. Register for subnet notices, and return success.
5901          */
5902         state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
5903         if (state->id_link_state == LINK_STATE_DOWN) {
5904                 late_hca_init = 1;
5905                 goto late_hca_init_return;
5906         }
5907 
5908         /*
5909          * Find the IPoIB broadcast group
5910          */
5911         if (ibd_find_bgroup(state) != IBT_SUCCESS) {
5912                 /* Resort to late initialization */
5913                 late_hca_init = 1;
5914                 goto reg_snet_notices;
5915         }
5916         state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
5917 
5918         /*
5919          * Initialize per-interface caches and lists; if we fail here,
5920          * it is most likely due to a lack of resources
5921          */
5922         if (ibd_acache_init(state) != DDI_SUCCESS) {
5923                 DPRINT(10, "ibd_start: ibd_acache_init() failed");
5924                 err = ENOMEM;
5925                 goto start_fail;
5926         }
5927         state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
5928 
5929         /*
5930          * Allocate send and receive completion queues
5931          */
5932         if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
5933                 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
5934                 err = ENOMEM;
5935                 goto start_fail;
5936         }
5937         state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
5938 
5939         /*
5940          * Setup a UD channel
5941          */
5942         if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
5943                 err = ENOMEM;
5944                 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
5945                 goto start_fail;
5946         }
5947         state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
5948 
5949         /*
5950          * Allocate and initialize the tx buffer list
5951          */
5952         if (ibd_init_txlist(state) != DDI_SUCCESS) {
5953                 DPRINT(10, "ibd_start: ibd_init_txlist() failed");
5954                 err = ENOMEM;
5955                 goto start_fail;
5956         }
5957         state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
5958 
5959         /*
5960          * Create the send cq handler here
5961          */
5962         ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
5963         if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
5964             IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5965                 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
5966                     "failed, ret=%d", ret);
5967                 err = EINVAL;
5968                 goto start_fail;
5969         }
5970         state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
5971 
5972         /*
5973          * Allocate and initialize the rx buffer list
5974          */
5975         if (ibd_init_rxlist(state) != DDI_SUCCESS) {
5976                 DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
5977                 err = ENOMEM;
5978                 goto start_fail;
5979         }
5980         state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
5981 
5982         /*
5983          * Join IPoIB broadcast group
5984          */
5985         if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
5986                 DPRINT(10, "ibd_start: ibd_join_group() failed");
5987                 err = ENOTACTIVE;
5988                 goto start_fail;
5989         }
5990         state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
5991 
5992         /*
5993          * When we did mac_register() in ibd_attach(), we didn't register
5994          * the real macaddr and we didn't have the true port mtu. Now that
5995          * we're almost ready, set the local mac address and broadcast
5996          * addresses and update gldv3 about the real values of these
5997          * parameters.
5998          */
5999         if (state->id_enable_rc) {
6000                 ibd_h2n_mac(&state->id_macaddr,
6001                     IBD_MAC_ADDR_RC + state->id_qpnum,
6002                     state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6003                 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
6004                     state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6005         } else {
6006                 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
6007                     state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6008         }
6009         ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
6010             state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
6011 
6012         if (!state->id_enable_rc) {
6013                 (void) mac_maxsdu_update2(state->id_mh,
6014                     state->id_mtu - IPOIB_HDRSIZE,
6015                     state->id_mtu - IPOIB_HDRSIZE);
6016         }
6017         mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6018 
6019         /*
6020          * Setup the receive cq handler
6021          */
6022         ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
6023         if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
6024             IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
6025                 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
6026                     "failed, ret=%d", ret);
6027                 err = EINVAL;
6028                 goto start_fail;
6029         }
6030         state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
6031 
6032 reg_snet_notices:
6033         /*
6034          * In the normal initialization sequence, we set up the subnet
6035          * notices handler after we've initialized the acache/mcache and
6036          * started the async thread, both of which are required for the
6037          * trap handler to function properly.
6038          *
6039          * Now that the async thread has been started (and we've already done
6040          * a mac_register() during attach so mac_tx_update() can be called
6041          * if necessary without any problem), we can enable the trap handler
6042          * to queue requests to the async thread.
6043          *
6044          * In case of late hca initialization, the subnet notices handler
6045          * only handles the MCG created/deleted events, and the action taken
6046          * on those events is to start the interface. So the acache/mcache
6047          * initialization is not a prerequisite for registering the subnet
6048          * notices handler in that case. Also, if we are in ibd_start() as a
6049          * result of, say, some event handling after entering the late hca
6050          * initialization phase, there is no need to register again.
6051          */
6052         if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) {
6053                 ibt_register_subnet_notices(state->id_ibt_hdl,
6054                     ibd_snet_notices_handler, state);
6055                 mutex_enter(&state->id_trap_lock);
6056                 state->id_trap_stop = B_FALSE;
6057                 mutex_exit(&state->id_trap_lock);
6058                 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
6059         }
6060 
6061 late_hca_init_return:
6062         if (late_hca_init == 1) {
6063                 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT;
6064                 /*
6065                  * In case of late initialization, mark the link state as
6066                  * down, regardless of the actual link state reported in
6067                  * the port_info.
6068                  */
6069                 state->id_link_state = LINK_STATE_DOWN;
6070                 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6071                 mac_link_update(state->id_mh, state->id_link_state);
6072                 return (DDI_SUCCESS);
6073         }
6074 
6075         if (state->id_enable_rc) {
6076                 if (state->rc_enable_srq) {
6077                         if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
6078                                 if (ibd_rc_repost_srq_free_list(state) !=
6079                                     IBT_SUCCESS) {
6080                                         err = ENOMEM;
6081                                         goto start_fail;
6082                                 }
6083                         } else {
6084                                 /* Allocate SRQ resource */
6085                                 if (ibd_rc_init_srq_list(state) !=
6086                                     IBT_SUCCESS) {
6087                                         err = ENOMEM;
6088                                         goto start_fail;
6089                                 }
6090                                 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
6091                         }
6092                 }
6093 
6094                 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
6095                         DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
6096                             "failed");
6097                         err = ENOMEM;
6098                         goto start_fail;
6099                 }
6100                 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
6101 
6102                 /* RC: begin to listen only after everything is available */
6103                 if (ibd_rc_listen(state) != IBT_SUCCESS) {
6104                         DPRINT(10, "ibd_start: ibd_rc_listen() failed");
6105                         err = EINVAL;
6106                         goto start_fail;
6107                 }
6108                 state->id_mac_state |= IBD_DRV_RC_LISTEN;
6109         }
6110 
6111         /*
6112          * Indicate link status to GLDv3 and higher layers. By default,
6113          * we assume we are in up state (which must have been true at
6114          * least at the time the broadcast mcg's were probed); if there
6115          * were any up/down transitions before we got here, the async
6116          * handler will have updated the last known state, which is what
6117          * we report to GLDv3. The async handler will not send any
6118          * notifications to GLDv3 until we reach this point in the
6119          * initialization sequence.
6120          */
6121         mac_link_update(state->id_mh, state->id_link_state);
6122         state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT;
6123         state->id_mac_state |= IBD_DRV_STARTED;
6124 
6125         /* Start timer after everything is ready */
6126         if (state->id_enable_rc) {
6127                 mutex_enter(&state->rc_timeout_lock);
6128                 state->rc_timeout_start = B_TRUE;
6129                 state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
6130                     SEC_TO_TICK(ibd_rc_conn_timeout));
6131                 mutex_exit(&state->rc_timeout_lock);
6132                 state->id_mac_state |= IBD_DRV_RC_TIMEOUT;
6133         }
6134 
6135         return (DDI_SUCCESS);
6136 
6137 start_fail:
6138         /*
6139          * If we ran into a problem during ibd_start() and ran into
6140          * some other problem during undoing our partial work, we can't
6141          * do anything about it.  Ignore any errors we might get from
6142          * ibd_undo_start() and just return the original error we got.
6143          */
6144         (void) ibd_undo_start(state, LINK_STATE_DOWN);
6145         return (err);
6146 }
6147 
6148 /*
6149  * GLDv3 entry point to stop hardware from receiving packets.
6150  */
6151 /*ARGSUSED*/
6152 static void
6153 ibd_m_stop(void *arg)
6154 {
6155         ibd_state_t *state = (ibd_state_t *)arg;
6156 
6157         if (state->id_type == IBD_PORT_DRIVER)
6158                 return;
6159 
6160         ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6161 
6162         (void) ibd_undo_start(state, state->id_link_state);
6163 
6164         ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6165 }
6166 
6167 /*
6168  * GLDv3 entry point to modify the device's mac address. We do not
6169  * allow address modifications.
6170  */
6171 static int
6172 ibd_m_unicst(void *arg, const uint8_t *macaddr)
6173 {
6174         ibd_state_t *state = arg;
6175 
6176         if (state->id_type == IBD_PORT_DRIVER)
6177                 return (EINVAL);
6178 
6179         /*
6180          * Don't bother even comparing the macaddr if we haven't
6181          * completed ibd_m_start().
6182          */
6183         if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6184                 return (0);
6185 
6186         if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
6187                 return (0);
6188         else
6189                 return (EINVAL);
6190 }
6191 
6192 /*
6193  * The blocking part of the IBA join/leave operations is done out
6194  * of here on the async thread.
6195  */
6196 static void
6197 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
6198 {
6199         DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
6200             "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
6201 
6202         if (op == IBD_ASYNC_JOIN) {
6203                 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
6204                         ibd_print_warn(state, "Join multicast group failed :"
6205                         "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6206                 }
6207         } else {
6208                 /*
6209                  * Here, we must search for the proper mcg_info and
6210                  * use that to leave the group.
6211                  */
6212                 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
6213         }
6214 }
6215 
6216 /*
6217  * GLDv3 entry point for multicast enable/disable requests.
6218  * This function queues the operation to the async thread and
6219  * returns success for a valid multicast address.
6220  */
6221 static int
6222 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
6223 {
6224         ibd_state_t *state = (ibd_state_t *)arg;
6225         ipoib_mac_t maddr, *mcast;
6226         ib_gid_t mgid;
6227         ibd_req_t *req;
6228 
6229         if (state->id_type == IBD_PORT_DRIVER)
6230                 return (EINVAL);
6231 
6232         /*
6233          * If we haven't completed ibd_m_start(), the async thread wouldn't
6234          * have been started and id_bcaddr wouldn't be set, so there's
6235          * no point in continuing.
6236          */
6237         if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6238                 return (0);
6239 
6240         /*
6241          * The incoming multicast address might not be aligned properly
6242          * on a 4 byte boundary to be considered an ipoib_mac_t, so copy
6243          * it into a properly aligned local (maddr) first. The copy lets
6244          * us safely pick out the mc gid and qpn fields without worrying
6245          * about the alignment of the caller's buffer.
6246          */
6247         bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
6248         mcast = &maddr;
6249 
6250         /*
6251          * Check validity of MCG address. We could additionally check
6252          * that an enable/disable is not being issued on the "broadcast"
6253          * mcg, but since this operation is only invokable by privileged
6254          * programs anyway, we allow those dlpi apps that flexibility.
6255          * Note that we do not validate the "scope" of the IBA mcg.
6256          */
6257         if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
6258                 return (EINVAL);
6259 
6260         /*
6261          * fill in multicast pkey and scope
6262          */
6263         IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
6264 
6265         /*
6266          * If someone is trying to JOIN/LEAVE the broadcast group, we do
6267          * nothing (i.e. we stay JOINed to the broadcast group, as done in
6268          * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
6269          * requires being joined to broadcast groups at all times.
6270          * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
6271          * depends on this.
6272          */
6273         if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6274                 return (0);
6275 
6276         ibd_n2h_gid(mcast, &mgid);
6277         req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6278         if (req == NULL)
6279                 return (ENOMEM);
6280 
6281         req->rq_gid = mgid;
6282 
6283         if (add) {
6284                 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
6285                     mgid.gid_prefix, mgid.gid_guid);
6286                 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
6287         } else {
6288                 DPRINT(1, "ibd_m_multicst : unset_multicast : "
6289                     "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6290                 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
6291         }
6292         return (0);
6293 }
6294 
6295 /*
6296  * The blocking part of the IBA promiscuous operations is done
6297  * out of here on the async thread. This routine handles the unset
6298  * side: it leaves all the mcg's that were joined as NonMember
6299  * while promiscuous mode was enabled.
6300  */
6301 static void
6302 ibd_async_unsetprom(ibd_state_t *state)
6303 {
6304         ibd_mce_t *mce = list_head(&state->id_mc_non);
6305         ib_gid_t mgid;
6306 
6307         DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
6308 
6309         while (mce != NULL) {
6310                 mgid = mce->mc_info.mc_adds_vect.av_dgid;
6311                 mce = list_next(&state->id_mc_non, mce);
6312                 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
6313         }
6314         state->id_prom_op = IBD_OP_NOTSTARTED;
6315 }
6316 
6317 /*
6318  * The blocking part of the IBA promiscuous operations is done
6319  * out of here on the async thread. This routine handles the set
6320  * side: it queries the fabric for all active mcg's that match our
6321  * pkey/scope and joins each of them as a NonMember.
6322  */
6323 static void
6324 ibd_async_setprom(ibd_state_t *state)
6325 {
6326         ibt_mcg_attr_t mcg_attr;
6327         ibt_mcg_info_t *mcg_info;
6328         ib_gid_t mgid;
6329         uint_t numg;
6330         int i;
6331         char ret = IBD_OP_COMPLETED;
6332 
6333         DPRINT(2, "ibd_async_setprom : async_set_promisc");
6334 
6335         /*
6336          * Obtain all active MC groups on the IB fabric with
6337          * specified criteria (scope + Pkey + Qkey + mtu).
6338          */
6339         bzero(&mcg_attr, sizeof (mcg_attr));
6340         mcg_attr.mc_pkey = state->id_pkey;
6341         mcg_attr.mc_scope = state->id_scope;
6342         mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
6343         mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
6344         mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
6345         if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
6346             IBT_SUCCESS) {
6347                 ibd_print_warn(state, "Could not get list of IBA multicast "
6348                     "groups");
6349                 ret = IBD_OP_ERRORED;
6350                 goto done;
6351         }
6352 
6353         /*
6354          * Iterate over the returned mcg's and join as NonMember
6355          * to the IP mcg's.
6356          */
6357         for (i = 0; i < numg; i++) {
6358                 /*
6359                  * Do a NonMember JOIN on the MC group.
6360                  */
6361                 mgid = mcg_info[i].mc_adds_vect.av_dgid;
6362                 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
6363                         ibd_print_warn(state, "IBA promiscuous mode missed "
6364                             "multicast gid %016llx:%016llx",
6365                             (u_longlong_t)mgid.gid_prefix,
6366                             (u_longlong_t)mgid.gid_guid);
6367         }
6368 
6369         ibt_free_mcg_info(mcg_info, numg);
6370         DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
6371 done:
6372         state->id_prom_op = ret;
6373 }
6374 
6375 /*
6376  * GLDv3 entry point for multicast promiscuous enable/disable requests.
6377  * GLDv3 assumes phys state receives more packets than multi state,
6378  * which is not true for IPoIB. Thus, treat the multi and phys
6379  * promiscuous states the same way to work with GLDv3's assumption.
6380  */
6381 static int
6382 ibd_m_promisc(void *arg, boolean_t on)
6383 {
6384         ibd_state_t *state = (ibd_state_t *)arg;
6385         ibd_req_t *req;
6386 
6387         if (state->id_type == IBD_PORT_DRIVER)
6388                 return (EINVAL);
6389 
6390         /*
6391          * The async thread won't have been started if we haven't
6392          * completed ibd_m_start().
6393          */
6394         if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6395                 return (0);
6396 
6397         req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6398         if (req == NULL)
6399                 return (ENOMEM);
6400         if (on) {
6401                 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
6402                 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
6403         } else {
6404                 DPRINT(1, "ibd_m_promisc : unset_promisc");
6405                 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
6406         }
6407 
6408         return (0);
6409 }
6410 
6411 /*
6412  * GLDv3 entry point for gathering statistics.
6413  */
6414 static int
6415 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
6416 {
6417         ibd_state_t *state = (ibd_state_t *)arg;
6418 
6419         switch (stat) {
6420         case MAC_STAT_IFSPEED:
6421                 *val = state->id_link_speed;
6422                 break;
6423         case MAC_STAT_MULTIRCV:
6424                 *val = state->id_multi_rcv;
6425                 break;
6426         case MAC_STAT_BRDCSTRCV:
6427                 *val = state->id_brd_rcv;
6428                 break;
6429         case MAC_STAT_MULTIXMT:
6430                 *val = state->id_multi_xmt;
6431                 break;
6432         case MAC_STAT_BRDCSTXMT:
6433                 *val = state->id_brd_xmt;
6434                 break;
6435         case MAC_STAT_RBYTES:
6436                 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte
6437                     + state->rc_rcv_copy_byte;
6438                 break;
6439         case MAC_STAT_IPACKETS:
6440                 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
6441                     + state->rc_rcv_copy_pkt;
6442                 break;
6443         case MAC_STAT_OBYTES:
6444                 *val = state->id_xmt_bytes + state->rc_xmt_bytes;
6445                 break;
6446         case MAC_STAT_OPACKETS:
6447                 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
6448                     state->rc_xmt_fragmented_pkt +
6449                     state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
6450                 break;
6451         case MAC_STAT_OERRORS:
6452                 *val = state->id_ah_error;   /* failed AH translation */
6453                 break;
6454         case MAC_STAT_IERRORS:
6455                 *val = 0;
6456                 break;
6457         case MAC_STAT_NOXMTBUF:
6458                 *val = state->id_tx_short + state->rc_swqe_short +
6459                     state->rc_xmt_buf_short;
6460                 break;
6461         case MAC_STAT_NORCVBUF:
6462         default:
6463                 return (ENOTSUP);
6464         }
6465 
6466         return (0);
6467 }
6468 
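     /*
      * Async handler used to resume transmission after a temporary
      * shortage of swqe's or LSO buffers: once enough resources have been
      * freed to cross the relevant threshold, the scheduling flag is
      * cleared and mac_tx_update() is called so GLDv3 retries the
      * blocked sends.
      */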
6469 static void
6470 ibd_async_txsched(ibd_state_t *state)
6471 {
6472         ibd_resume_transmission(state);
6473 }
6474 
6475 static void
6476 ibd_resume_transmission(ibd_state_t *state)
6477 {
6478         int flag;
6479         int met_thresh = 0;
6480         int thresh = 0;
6481         int ret = -1;
6482 
6483         mutex_enter(&state->id_sched_lock);
6484         if (state->id_sched_needed & IBD_RSRC_SWQE) {
6485                 mutex_enter(&state->id_tx_list.dl_mutex);
6486                 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6487                 met_thresh = state->id_tx_list.dl_cnt +
6488                     state->id_tx_rel_list.dl_cnt;
6489                 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6490                 mutex_exit(&state->id_tx_list.dl_mutex);
6491                 thresh = IBD_FREE_SWQES_THRESH;
6492                 flag = IBD_RSRC_SWQE;
6493         } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6494                 ASSERT(state->id_lso != NULL);
6495                 mutex_enter(&state->id_lso_lock);
6496                 met_thresh = state->id_lso->bkt_nfree;
6497                 thresh = IBD_FREE_LSOS_THRESH;
6498                 mutex_exit(&state->id_lso_lock);
6499                 flag = IBD_RSRC_LSOBUF;
6500                 if (met_thresh > thresh)
6501                         state->id_sched_lso_cnt++;
6502         }
6503         if (met_thresh > thresh) {
6504                 state->id_sched_needed &= ~flag;
6505                 state->id_sched_cnt++;
6506                 ret = 0;
6507         }
6508         mutex_exit(&state->id_sched_lock);
6509 
6510         if (ret == 0)
6511                 mac_tx_update(state->id_mh);
6512 }
6513 
6514 /*
6515  * Release a chain of send wqe's back onto the free list.
6516  */
6517 static void
6518 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6519 {
6520         /*
6521          * Add back on Tx list for reuse.
6522          */
6523         ASSERT(tail->swqe_next == NULL);
6524         mutex_enter(&state->id_tx_rel_list.dl_mutex);
6525         state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6526         tail->swqe_next = state->id_tx_rel_list.dl_head;
6527         state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6528         state->id_tx_rel_list.dl_cnt += n;
6529         mutex_exit(&state->id_tx_rel_list.dl_mutex);
6530 }
6531 
6532 /*
6533  * Acquire a send wqe from the free list.
6534  * Returns the wqe pointer, or NULL if none is available.
6535  */
6536 static ibd_swqe_t *
6537 ibd_acquire_swqe(ibd_state_t *state)
6538 {
6539         ibd_swqe_t *wqe;
6540 
6541         mutex_enter(&state->id_tx_rel_list.dl_mutex);
6542         if (state->id_tx_rel_list.dl_head != NULL) {
6543                 /* transfer id_tx_rel_list to id_tx_list */
6544                 state->id_tx_list.dl_head =
6545                     state->id_tx_rel_list.dl_head;
6546                 state->id_tx_list.dl_cnt =
6547                     state->id_tx_rel_list.dl_cnt;
6548                 state->id_tx_list.dl_pending_sends = B_FALSE;
6549 
6550                 /* clear id_tx_rel_list */
6551                 state->id_tx_rel_list.dl_head = NULL;
6552                 state->id_tx_rel_list.dl_cnt = 0;
6553                 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6554 
6555                 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6556                 state->id_tx_list.dl_cnt -= 1;
6557                 state->id_tx_list.dl_head = wqe->swqe_next;
6558         } else {        /* no free swqe */
6559                 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6560                 state->id_tx_list.dl_pending_sends = B_TRUE;
6561                 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6562                 state->id_tx_short++;
6563                 wqe = NULL;
6564         }
6565         return (wqe);
6566 }
6567 
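     /*
      * Fill in the LSO portion of the send work request: record the
      * destination and MSS, compute the combined IPoIB+IP+TCP header
      * length, and point lso_hdr at the header bytes, copying them into
      * a separately allocated buffer only when they do not fit within
      * the first mblk fragment.
      */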
6568 static int
6569 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6570     ibt_ud_dest_hdl_t ud_dest)
6571 {
6572         mblk_t  *nmp;
6573         int iph_len, tcph_len;
6574         ibt_wr_lso_t *lso;
6575         uintptr_t ip_start, tcp_start;
6576         uint8_t *dst;
6577         uint_t pending, mblen;
6578 
6579         /*
6580          * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6581          * we need to adjust it here for lso.
6582          */
6583         lso = &(node->w_swr.wr.ud_lso);
6584         lso->lso_ud_dest = ud_dest;
6585         lso->lso_mss = mss;
6586 
6587         /*
6588          * Calculate the LSO header size and set it in the UD LSO structure.
6589          * Note that the only assumption we make is that each of the IPoIB,
6590          * IP and TCP headers will be contained in a single mblk fragment;
6591          * together, the headers may span multiple mblk fragments.
6592          */
6593         nmp = mp;
6594         ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
6595         if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
6596                 ip_start = (uintptr_t)nmp->b_cont->b_rptr
6597                     + (ip_start - (uintptr_t)(nmp->b_wptr));
6598                 nmp = nmp->b_cont;
6600         }
6601         iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
6602 
6603         tcp_start = ip_start + iph_len;
6604         if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
6605                 tcp_start = (uintptr_t)nmp->b_cont->b_rptr
6606                     + (tcp_start - (uintptr_t)(nmp->b_wptr));
6607                 nmp = nmp->b_cont;
6608         }
6609         tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
6610         lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
6611 
6612         /*
6613          * If the lso header fits entirely within a single mblk fragment,
6614          * we'll avoid an additional copy of the lso header here and just
6615          * pass the b_rptr of the mblk directly.
6616          *
6617          * If this isn't true, we'd have to allocate space for it explicitly.
6618          */
6619         if (lso->lso_hdr_sz <= MBLKL(mp)) {
6620                 lso->lso_hdr = mp->b_rptr;
6621         } else {
6622                 /* On work completion, remember to free this allocated hdr */
6623                 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
6624                 if (lso->lso_hdr == NULL) {
6625                         DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
6626                             "sz = %d", lso->lso_hdr_sz);
6627                         lso->lso_hdr_sz = 0;
6628                         lso->lso_mss = 0;
6629                         return (-1);
6630                 }
6631         }
6632 
6633         /*
6634          * Copy in the lso header only if we need to
6635          */
6636         if (lso->lso_hdr != mp->b_rptr) {
6637                 dst = lso->lso_hdr;
6638                 pending = lso->lso_hdr_sz;
6639 
6640                 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
6641                         mblen = MBLKL(nmp);
6642                         if (pending > mblen) {
6643                                 bcopy(nmp->b_rptr, dst, mblen);
6644                                 dst += mblen;
6645                                 pending -= mblen;
6646                         } else {
6647                                 bcopy(nmp->b_rptr, dst, pending);
6648                                 break;
6649                         }
6650                 }
6651         }
6652 
6653         return (0);
6654 }
6655 
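     /*
      * Release the LSO header buffer allocated by ibd_setup_lso(), if one
      * was needed for this work request.
      */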
6656 static void
6657 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
6658 {
6659         ibt_wr_lso_t *lso;
6660 
6661         if ((!node) || (!mp))
6662                 return;
6663 
6664         /*
6665          * Free any header space that we might've allocated if we
6666          * did an LSO
6667          */
6668         if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
6669                 lso = &(node->w_swr.wr.ud_lso);
6670                 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
6671                         kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
6672                         lso->lso_hdr = NULL;
6673                         lso->lso_hdr_sz = 0;
6674                 }
6675         }
6676 }
6677 
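     /*
      * Post the given swqe to the UD channel, then keep draining whatever
      * has accumulated on id_tx_head, posting up to
      * IBD_MAX_TX_POST_MULTIPLE work requests per ibt_post_send() call,
      * until the queue is empty and id_tx_busy can be cleared.
      */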
6678 static void
6679 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
6680 {
6681         uint_t          i;
6682         uint_t          num_posted;
6683         uint_t          n_wrs;
6684         ibt_status_t    ibt_status;
6685         ibt_send_wr_t   wrs[IBD_MAX_TX_POST_MULTIPLE];
6686         ibd_swqe_t      *tx_head, *elem;
6687         ibd_swqe_t      *nodes[IBD_MAX_TX_POST_MULTIPLE];
6688 
6689         /* post the one request, then check for more */
6690         ibt_status = ibt_post_send(state->id_chnl_hdl,
6691             &node->w_swr, 1, NULL);
6692         if (ibt_status != IBT_SUCCESS) {
6693                 ibd_print_warn(state, "ibd_post_send: "
6694                     "posting one wr failed: ret=%d", ibt_status);
6695                 ibd_tx_cleanup(state, node);
6696         }
6697 
6698         tx_head = NULL;
6699         for (;;) {
6700                 if (tx_head == NULL) {
6701                         mutex_enter(&state->id_txpost_lock);
6702                         tx_head = state->id_tx_head;
6703                         if (tx_head == NULL) {
6704                                 state->id_tx_busy = 0;
6705                                 mutex_exit(&state->id_txpost_lock);
6706                                 return;
6707                         }
6708                         state->id_tx_head = NULL;
6709                         mutex_exit(&state->id_txpost_lock);
6710                 }
6711 
6712                 /*
6713                  * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
6714                  * at a time if possible, and keep posting them.
6715                  */
6716                 for (n_wrs = 0, elem = tx_head;
6717                     (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
6718                     elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
6719                         nodes[n_wrs] = elem;
6720                         wrs[n_wrs] = elem->w_swr;
6721                 }
6722                 tx_head = elem;
6723 
6724                 ASSERT(n_wrs != 0);
6725 
6726                 /*
6727                  * If posting fails for some reason, we'll never receive
6728                  * a completion notification, so we'll need to clean up. But
6729                  * we need to make sure we don't clean up nodes whose
6730                  * wrs have been successfully posted. We assume that the
6731                  * hca driver returns on the first failure to post and
6732                  * therefore the first 'num_posted' entries don't need
6733                  * cleanup here.
6734                  */
6735                 num_posted = 0;
6736                 ibt_status = ibt_post_send(state->id_chnl_hdl,
6737                     wrs, n_wrs, &num_posted);
6738                 if (ibt_status != IBT_SUCCESS) {
6739                         ibd_print_warn(state, "ibd_post_send: "
6740                             "posting multiple wrs failed: "
6741                             "requested=%d, done=%d, ret=%d",
6742                             n_wrs, num_posted, ibt_status);
6743 
6744                         for (i = num_posted; i < n_wrs; i++)
6745                                 ibd_tx_cleanup(state, nodes[i]);
6746                 }
6747         }
6748 }
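
/*
 * A note on the batching above, with a minimal sketch (illustration only,
 * not driver code; gather_swqes() and cleanup_swqe() are hypothetical
 * stand-ins for the inline logic): pending sends queued on id_tx_head are
 * copied into an on-stack array of at most IBD_MAX_TX_POST_MULTIPLE work
 * requests and handed to a single ibt_post_send() call.  On a partial
 * failure only the entries from num_posted onward are cleaned up, since
 * those will never generate a completion.
 *
 *	n = gather_swqes(tx_head, wrs, nodes, IBD_MAX_TX_POST_MULTIPLE);
 *	if (ibt_post_send(state->id_chnl_hdl, wrs, n, &num_posted)
 *	    != IBT_SUCCESS) {
 *		for (i = num_posted; i < n; i++)
 *			cleanup_swqe(nodes[i]);
 *	}
 */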
6749 
6750 static int
6751 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
6752     uint_t lsohdr_sz)
6753 {
6754         ibt_wr_ds_t *sgl;
6755         ibt_status_t ibt_status;
6756         mblk_t *nmp;
6757         mblk_t *data_mp;
6758         uchar_t *bufp;
6759         size_t blksize;
6760         size_t skip;
6761         size_t avail;
6762         uint_t pktsize;
6763         uint_t frag_len;
6764         uint_t pending_hdr;
6765         int nmblks;
6766         int i;
6767 
6768         /*
6769          * Let's skip ahead to the data if this is LSO
6770          */
6771         data_mp = mp;
6772         pending_hdr = 0;
6773         if (lsohdr_sz) {
6774                 pending_hdr = lsohdr_sz;
6775                 for (nmp = mp; nmp; nmp = nmp->b_cont) {
6776                         frag_len = nmp->b_wptr - nmp->b_rptr;
6777                         if (frag_len > pending_hdr)
6778                                 break;
6779                         pending_hdr -= frag_len;
6780                 }
6781                 data_mp = nmp;  /* start of data past lso header */
6782                 ASSERT(data_mp != NULL);
6783         }
6784 
6785         /*
6786          * Calculate the size of message data and number of msg blocks
6787          */
6788         pktsize = 0;
6789         for (nmblks = 0, nmp = data_mp; nmp != NULL;
6790             nmp = nmp->b_cont, nmblks++) {
6791                 pktsize += MBLKL(nmp);
6792         }
6793         pktsize -= pending_hdr;
6794 
6795         /*
6796          * We only do ibt_map_mem_iov() if the pktsize is above the
6797          * "copy-threshold", and if the number of mp fragments is less than
6798          * the maximum acceptable.
6799          */
6800         if ((state->id_hca_res_lkey_capab) &&
6801             (pktsize > state->id_ud_tx_copy_thresh) &&
6802             (nmblks < state->id_max_sqseg_hiwm)) {
6803                 ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6804                 ibt_iov_attr_t iov_attr;
6805 
6806                 iov_attr.iov_as = NULL;
6807                 iov_attr.iov = iov_arr;
6808                 iov_attr.iov_buf = NULL;
6809                 iov_attr.iov_list_len = nmblks;
6810                 iov_attr.iov_wr_nds = state->id_max_sqseg;
6811                 iov_attr.iov_lso_hdr_sz = lsohdr_sz;
6812                 iov_attr.iov_flags = IBT_IOV_SLEEP;
6813 
6814                 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
6815                         iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
6816                         iov_arr[i].iov_len = MBLKL(nmp);
6817                         if (i == 0) {
6818                                 iov_arr[i].iov_addr += pending_hdr;
6819                                 iov_arr[i].iov_len -= pending_hdr;
6820                         }
6821                 }
6822 
6823                 node->w_buftype = IBD_WQE_MAPPED;
6824                 node->w_swr.wr_sgl = node->w_sgl;
6825 
6826                 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
6827                     (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
6828                 if (ibt_status != IBT_SUCCESS) {
6829                         ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
6830                             "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
6831                         goto ibd_copy_path;
6832                 }
6833 
6834                 return (0);
6835         }
6836 
6837 ibd_copy_path:
6838         if (pktsize <= state->id_tx_buf_sz) {
6839                 node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6840                 node->w_swr.wr_nds = 1;
6841                 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6842                 node->w_buftype = IBD_WQE_TXBUF;
6843 
6844                 /*
6845                  * Even though this is the copy path for transfers less than
6846                  * id_tx_buf_sz, it could still be an LSO packet.  If so, it
6847                  * is possible the first data mblk fragment (data_mp) still
6848                  * contains part of the LSO header that we need to skip.
6849                  */
6850                 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6851                 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
6852                         blksize = MBLKL(nmp) - pending_hdr;
6853                         bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
6854                         bufp += blksize;
6855                         pending_hdr = 0;
6856                 }
6857 
6858                 return (0);
6859         }
6860 
6861         /*
6862          * Copy path for transfers greater than id_tx_buf_sz
6863          */
6864         node->w_swr.wr_sgl = node->w_sgl;
6865         if (ibd_acquire_lsobufs(state, pktsize,
6866             node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
6867                 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
6868                 return (-1);
6869         }
6870         node->w_buftype = IBD_WQE_LSOBUF;
6871 
6872         /*
6873          * Copy the larger-than-id_tx_buf_sz packet into a set of
6874          * fixed-sized, pre-mapped LSO buffers. Note that we might
6875          * need to skip part of the LSO header in the first fragment
6876          * as before.
6877          */
6878         nmp = data_mp;
6879         skip = pending_hdr;
6880         for (i = 0; i < node->w_swr.wr_nds; i++) {
6881                 sgl = node->w_swr.wr_sgl + i;
6882                 bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
6883                 avail = IBD_LSO_BUFSZ;
6884                 while (nmp && avail) {
6885                         blksize = MBLKL(nmp) - skip;
6886                         if (blksize > avail) {
6887                                 bcopy(nmp->b_rptr + skip, bufp, avail);
6888                                 skip += avail;
6889                                 avail = 0;
6890                         } else {
6891                                 bcopy(nmp->b_rptr + skip, bufp, blksize);
6892                                 skip = 0;
6893                                 avail -= blksize;
6894                                 bufp += blksize;
6895                                 nmp = nmp->b_cont;
6896                         }
6897                 }
6898         }
6899 
6900         return (0);
6901 }
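
/*
 * To summarize the choices ibd_prepare_sgl() just made, the Tx buffer
 * strategy is picked from the packet size and fragment count (a sketch of
 * the decision only, not additional code):
 *
 *	if (reserved-lkey capable && pktsize > id_ud_tx_copy_thresh &&
 *	    nmblks < id_max_sqseg_hiwm)
 *		IBD_WQE_MAPPED:  ibt_map_mem_iov() maps the mblk fragments
 *		    in place, no data copy;
 *	else if (pktsize <= id_tx_buf_sz)
 *		IBD_WQE_TXBUF:   copy into the swqe's pre-mapped copybuf;
 *	else
 *		IBD_WQE_LSOBUF:  copy into pre-mapped IBD_LSO_BUFSZ-sized
 *		    buffers acquired from the LSO pool.
 *
 * ibd_tx_cleanup() later unmaps or releases whatever w_buftype records.
 */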
6902 
6903 /*
6904  * Schedule a completion queue polling to reap the resource we're
6905  * short on.  If we implement the change to reap tx completions
6906  * in a separate thread, we'll need to wake up that thread here.
6907  */
6908 static int
6909 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
6910 {
6911         ibd_req_t *req;
6912 
6913         mutex_enter(&state->id_sched_lock);
6914         state->id_sched_needed |= resource_type;
6915         mutex_exit(&state->id_sched_lock);
6916 
6917         /*
6918          * If we are asked to queue a work entry, we need to do it
6919          */
6920         if (q_flag) {
6921                 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6922                 if (req == NULL)
6923                         return (-1);
6924 
6925                 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
6926         }
6927 
6928         return (0);
6929 }
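
/*
 * Typical use of ibd_sched_poll(), as in ibd_send() below: when a Tx
 * resource runs out, record what we are short of and, if needed, queue a
 * work entry so the async thread drives the completion-queue poll.  A
 * hedged sketch of the caller side:
 *
 *	if (node == NULL) {
 *		if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0)
 *			return (B_FALSE);	caller will retry later
 *		drop the packet;		poll could not be scheduled
 *	}
 */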
6930 
6931 /*
6932  * The passed in packet has this format:
6933  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
6934  */
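/*
 * Pictorially (offsets assume IPOIB_ADDRL == 20; an illustration only, not
 * a definition taken from the headers):
 *
 *	byte 0                   20     22     24
 *	+------------------------+------+------+---------------------
 *	| dest addr (ipoib_mac_t)| sap  | 0x00 | payload ...
 *	| IPOIB_ADDRL bytes      | (2B) | (2B) |
 *	+------------------------+------+------+---------------------
 */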
6935 static boolean_t
6936 ibd_send(ibd_state_t *state, mblk_t *mp)
6937 {
6938         ibd_ace_t *ace;
6939         ibd_swqe_t *node;
6940         ipoib_mac_t *dest;
6941         ib_header_info_t *ipibp;
6942         ip6_t *ip6h;
6943         uint_t pktsize;
6944         uint32_t mss;
6945         uint32_t hckflags;
6946         uint32_t lsoflags = 0;
6947         uint_t lsohdr_sz = 0;
6948         int ret, len;
6949         boolean_t dofree = B_FALSE;
6950         boolean_t rc;
6951         /* if (rc_chan == NULL) send by UD; else send by RC; */
6952         ibd_rc_chan_t *rc_chan;
6953         int nmblks;
6954         mblk_t *nmp;
6955 
6956         /*
6957          * If we aren't done with the device initialization and start,
6958          * we shouldn't be here.
6959          */
6960         if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6961                 return (B_FALSE);
6962 
6963         /*
6964          * Obtain an address handle for the destination.
6965          */
6966         ipibp = (ib_header_info_t *)mp->b_rptr;
6967         dest = (ipoib_mac_t *)&ipibp->ib_dst;
6968         if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6969                 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
6970 
6971         rc_chan = NULL;
6972         ace = ibd_acache_lookup(state, dest, &ret, 1);
6973         if (state->id_enable_rc && (ace != NULL) &&
6974             (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
6975                 if (ace->ac_chan == NULL) {
6976                         state->rc_null_conn++;
6977                 } else {
6978                         if (ace->ac_chan->chan_state ==
6979                             IBD_RC_STATE_ACT_ESTAB) {
6980                                 rc_chan = ace->ac_chan;
6981                                 rc_chan->is_used = B_TRUE;
6982                                 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
6983                                 node = WQE_TO_SWQE(
6984                                     rc_chan->tx_wqe_list.dl_head);
6985                                 if (node != NULL) {
6986                                         rc_chan->tx_wqe_list.dl_cnt -= 1;
6987                                         rc_chan->tx_wqe_list.dl_head =
6988                                             node->swqe_next;
6989                                 } else {
6990                                         node = ibd_rc_acquire_swqes(rc_chan);
6991                                 }
6992                                 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
6993 
6994                                 if (node == NULL) {
6995                                         state->rc_swqe_short++;
6996                                         mutex_enter(&state->id_sched_lock);
6997                                         state->id_sched_needed |=
6998                                             IBD_RSRC_RC_SWQE;
6999                                         mutex_exit(&state->id_sched_lock);
7000                                         ibd_dec_ref_ace(state, ace);
7001                                         return (B_FALSE);
7002                                 }
7003                         } else {
7004                                 state->rc_no_estab_conn++;
7005                         }
7006                 }
7007         }
7008 
7009         if (rc_chan == NULL) {
7010                 mutex_enter(&state->id_tx_list.dl_mutex);
7011                 node = WQE_TO_SWQE(state->id_tx_list.dl_head);
7012                 if (node != NULL) {
7013                         state->id_tx_list.dl_cnt -= 1;
7014                         state->id_tx_list.dl_head = node->swqe_next;
7015                 } else {
7016                         node = ibd_acquire_swqe(state);
7017                 }
7018                 mutex_exit(&state->id_tx_list.dl_mutex);
7019                 if (node == NULL) {
7020                         /*
7021                          * If we don't have an swqe available, schedule a
7022                          * transmit completion queue cleanup and hold off on
7023                          * sending more packets until we have some free swqes
7024                          */
7025                         if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
7026                                 if (ace != NULL) {
7027                                         ibd_dec_ref_ace(state, ace);
7028                                 }
7029                                 return (B_FALSE);
7030                         }
7031 
7032                         /*
7033                          * If a poll cannot be scheduled, we have no choice but
7034                          * to drop this packet
7035                          */
7036                         ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
7037                         if (ace != NULL) {
7038                                 ibd_dec_ref_ace(state, ace);
7039                         }
7040                         return (B_TRUE);
7041                 }
7042         }
7043 
7044         /*
7045          * Initialize the commonly used fields in swqe to NULL to protect
7046          * against ibd_tx_cleanup accidentally misinterpreting these on a
7047          * failure.
7048          */
7049         node->swqe_im_mblk = NULL;
7050         node->w_swr.wr_nds = 0;
7051         node->w_swr.wr_sgl = NULL;
7052         node->w_swr.wr_opcode = IBT_WRC_SEND;
7053 
7054         /*
7055          * Calculate the size of message data and number of msg blocks
7056          */
7057         pktsize = 0;
7058         for (nmblks = 0, nmp = mp; nmp != NULL;
7059             nmp = nmp->b_cont, nmblks++) {
7060                 pktsize += MBLKL(nmp);
7061         }
7062 
7063         if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7064                 atomic_inc_64(&state->id_brd_xmt);
7065         else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7066                 atomic_inc_64(&state->id_multi_xmt);
7067 
7068         if (ace != NULL) {
7069                 node->w_ahandle = ace;
7070                 node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
7071         } else {
7072                 DPRINT(5,
7073                     "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
7074                     ((ret == EFAULT) ? "failed" : "queued"),
7075                     htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
7076                     htonl(dest->ipoib_gidpref[1]),
7077                     htonl(dest->ipoib_gidsuff[0]),
7078                     htonl(dest->ipoib_gidsuff[1]));
7079                 state->rc_ace_not_found++;
7080                 node->w_ahandle = NULL;
7081 
7082                 /*
7083                  * If ibd_acache_lookup() returns EFAULT, it means ibd
7084                  * cannot find a path for the specified dest address, so
7085                  * this kind of packet should be dropped.  We also drop the
7086                  * packet if we cannot schedule a poll via the async
7087                  * thread.  For the normal case, ibd returns the packet to
7088                  * the upper layer and waits for the AH to be created.
7089                  *
7090                  * Note that we always queue a work slot entry for the async
7091                  * thread when we fail AH lookup (even in intr mode); this is
7092                  * due to the convoluted way the code currently looks for AH.
7093                  */
7094                 if (ret == EFAULT) {
7095                         dofree = B_TRUE;
7096                         rc = B_TRUE;
7097                 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
7098                         dofree = B_TRUE;
7099                         rc = B_TRUE;
7100                 } else {
7101                         dofree = B_FALSE;
7102                         rc = B_FALSE;
7103                 }
7104                 goto ibd_send_fail;
7105         }
7106 
7107         /*
7108          * For ND6 packets, padding is at the front of the source lladdr.
7109          * Insert the padding at front.
7110          */
7111         if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
7112                 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
7113                         if (!pullupmsg(mp, IPV6_HDR_LEN +
7114                             sizeof (ib_header_info_t))) {
7115                                 DPRINT(10, "ibd_send: pullupmsg failure ");
7116                                 dofree = B_TRUE;
7117                                 rc = B_TRUE;
7118                                 goto ibd_send_fail;
7119                         }
7120                         ipibp = (ib_header_info_t *)mp->b_rptr;
7121                 }
7122                 ip6h = (ip6_t *)((uchar_t *)ipibp +
7123                     sizeof (ib_header_info_t));
7124                 len = ntohs(ip6h->ip6_plen);
7125                 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7126                         mblk_t  *pad;
7127 
7128                         if ((pad = allocb(4, 0)) == NULL) {
                                     DPRINT(10, "ibd_send: allocb failure ");
                                     dofree = B_TRUE;
                                     rc = B_TRUE;
                                     goto ibd_send_fail;
                             }
7129                         pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
7130                         linkb(mp, pad);
7131                         if (MBLKL(mp) < sizeof (ib_header_info_t) +
7132                             IPV6_HDR_LEN + len + 4) {
7133                                 if (!pullupmsg(mp, sizeof (ib_header_info_t) +
7134                                     IPV6_HDR_LEN + len + 4)) {
7135                                         DPRINT(10, "ibd_send: pullupmsg "
7136                                             "failure ");
7137                                         dofree = B_TRUE;
7138                                         rc = B_TRUE;
7139                                         goto ibd_send_fail;
7140                                 }
7141                                 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
7142                                     sizeof (ib_header_info_t));
7143                         }
7144 
7145                         /* LINTED: E_CONSTANT_CONDITION */
7146                         IBD_PAD_NSNA(ip6h, len, IBD_SEND);
7147                 }
7148         }
7149 
7150         ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
7151         mp->b_rptr += sizeof (ib_addrs_t);
7152         pktsize -= sizeof (ib_addrs_t);
7153 
7154         if (rc_chan) {  /* send in RC mode */
7155                 ibt_iov_t iov_arr[IBD_MAX_SQSEG];
7156                 ibt_iov_attr_t iov_attr;
7157                 uint_t          i;
7158                 size_t  blksize;
7159                 uchar_t *bufp;
7160                 ibd_rc_tx_largebuf_t *lbufp;
7161 
7162                 atomic_add_64(&state->rc_xmt_bytes, pktsize);
7163 
7164                 /*
7165                  * The upper layer does the Tx checksum; we don't need to
7166                  * do any checksumming here.
7167                  */
7168                 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
7169 
7170                 /*
7171                  * We only do ibt_map_mem_iov() if the pktsize is above
7172                  * the "copy-threshold", and if the number of mp
7173                  * fragments is less than the maximum acceptable.
7174                  */
7175                 if (pktsize <= state->id_rc_tx_copy_thresh) {
7176                         atomic_inc_64(&state->rc_xmt_small_pkt);
7177                         /*
7178                          * Only unicast packets are processed in Reliable
7179                          * Connected mode.
7180                          */
7181                         node->swqe_copybuf.ic_sgl.ds_len = pktsize;
7182                         node->w_swr.wr_nds = 1;
7183                         node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
7184                         node->w_buftype = IBD_WQE_TXBUF;
7185 
7186                         bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
7187                         for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7188                                 blksize = MBLKL(nmp);
7189                                 bcopy(nmp->b_rptr, bufp, blksize);
7190                                 bufp += blksize;
7191                         }
7192                         freemsg(mp);
7193                         ASSERT(node->swqe_im_mblk == NULL);
7194                 } else {
7195                         if ((state->rc_enable_iov_map) &&
7196                             (nmblks < state->rc_max_sqseg_hiwm)) {
7197 
7198                                 /* do ibt_map_mem_iov() */
7199                                 iov_attr.iov_as = NULL;
7200                                 iov_attr.iov = iov_arr;
7201                                 iov_attr.iov_buf = NULL;
7202                                 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
7203                                 iov_attr.iov_lso_hdr_sz = 0;
7204                                 iov_attr.iov_flags = IBT_IOV_SLEEP;
7205 
7206                                 i = 0;
7207                                 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7208                                         iov_arr[i].iov_len = MBLKL(nmp);
7209                                         if (iov_arr[i].iov_len != 0) {
7210                                                 iov_arr[i].iov_addr = (caddr_t)
7211                                                     (void *)nmp->b_rptr;
7212                                                 i++;
7213                                         }
7214                                 }
7215                                 iov_attr.iov_list_len = i;
7216                                 node->w_swr.wr_sgl = node->w_sgl;
7217 
7218                                 ret = ibt_map_mem_iov(state->id_hca_hdl,
7219                                     &iov_attr, (ibt_all_wr_t *)&node->w_swr,
7220                                     &node->w_mi_hdl);
7221                                 if (ret != IBT_SUCCESS) {
7222                                         atomic_inc_64(
7223                                             &state->rc_xmt_map_fail_pkt);
7224                                         DPRINT(30, "ibd_send: ibt_map_mem_iov("
7225                                             ") failed, nmblks=%d, real_nmblks"
7226                                             "=%d, ret=0x%x", nmblks, i, ret);
7227                                         goto ibd_rc_large_copy;
7228                                 }
7229 
7230                                 atomic_inc_64(&state->rc_xmt_map_succ_pkt);
7231                                 node->w_buftype = IBD_WQE_MAPPED;
7232                                 node->swqe_im_mblk = mp;
7233                         } else {
7234                                 atomic_inc_64(&state->rc_xmt_fragmented_pkt);
7235 ibd_rc_large_copy:
7236                                 mutex_enter(&state->rc_tx_large_bufs_lock);
7237                                 if (state->rc_tx_largebuf_nfree == 0) {
7238                                         state->rc_xmt_buf_short++;
7239                                         mutex_exit
7240                                             (&state->rc_tx_large_bufs_lock);
7241                                         mutex_enter(&state->id_sched_lock);
7242                                         state->id_sched_needed |=
7243                                             IBD_RSRC_RC_TX_LARGEBUF;
7244                                         mutex_exit(&state->id_sched_lock);
7245                                         dofree = B_FALSE;
7246                                         rc = B_FALSE;
7247                                         /*
7248                                          * If we don't have Tx large bufs,
7249                                          * return failure. node->w_buftype
7250                                          * should not be IBD_WQE_RC_COPYBUF,
7251                                          * otherwise it will cause a problem
7252                                          * in ibd_rc_tx_cleanup().
7253                                          */
7254                                         node->w_buftype = IBD_WQE_TXBUF;
7255                                         goto ibd_send_fail;
7256                                 }
7257 
7258                                 lbufp = state->rc_tx_largebuf_free_head;
7259                                 ASSERT(lbufp->lb_buf != NULL);
7260                                 state->rc_tx_largebuf_free_head =
7261                                     lbufp->lb_next;
7262                                 lbufp->lb_next = NULL;
7263                                 /* Update nfree count */
7264                                 state->rc_tx_largebuf_nfree --;
7265                                 mutex_exit(&state->rc_tx_large_bufs_lock);
7266                                 bufp = lbufp->lb_buf;
7267                                 node->w_sgl[0].ds_va =
7268                                     (ib_vaddr_t)(uintptr_t)bufp;
7269                                 node->w_sgl[0].ds_key =
7270                                     state->rc_tx_mr_desc.md_lkey;
7271                                 node->w_sgl[0].ds_len = pktsize;
7272                                 node->w_swr.wr_sgl = node->w_sgl;
7273                                 node->w_swr.wr_nds = 1;
7274                                 node->w_buftype = IBD_WQE_RC_COPYBUF;
7275                                 node->w_rc_tx_largebuf = lbufp;
7276 
7277                                 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7278                                         blksize = MBLKL(nmp);
7279                                         if (blksize != 0) {
7280                                                 bcopy(nmp->b_rptr, bufp,
7281                                                     blksize);
7282                                                 bufp += blksize;
7283                                         }
7284                                 }
7285                                 freemsg(mp);
7286                                 ASSERT(node->swqe_im_mblk == NULL);
7287                         }
7288                 }
7289 
7290                 node->swqe_next = NULL;
7291                 mutex_enter(&rc_chan->tx_post_lock);
7292                 if (rc_chan->tx_busy) {
7293                         if (rc_chan->tx_head) {
7294                                 rc_chan->tx_tail->swqe_next =
7295                                     SWQE_TO_WQE(node);
7296                         } else {
7297                                 rc_chan->tx_head = node;
7298                         }
7299                         rc_chan->tx_tail = node;
7300                         mutex_exit(&rc_chan->tx_post_lock);
7301                 } else {
7302                         rc_chan->tx_busy = 1;
7303                         mutex_exit(&rc_chan->tx_post_lock);
7304                         ibd_rc_post_send(rc_chan, node);
7305                 }
7306 
7307                 return (B_TRUE);
7308         } /* send by RC */
7309 
7310         if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
7311                 /*
7312                  * The packet is too long: the packet size from GLD should
7313                  * be <= state->id_mtu + sizeof (ib_addrs_t).
7314                  */
7315                 if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
7316                         ibd_req_t *req;
7317 
7318                         mutex_enter(&ace->tx_too_big_mutex);
7319                         if (ace->tx_too_big_ongoing) {
7320                                 mutex_exit(&ace->tx_too_big_mutex);
7321                                 state->rc_xmt_reenter_too_long_pkt++;
7322                                 dofree = B_TRUE;
7323                         } else {
7324                                 ace->tx_too_big_ongoing = B_TRUE;
7325                                 mutex_exit(&ace->tx_too_big_mutex);
7326                                 state->rc_xmt_icmp_too_long_pkt++;
7327 
7328                                 req = kmem_cache_alloc(state->id_req_kmc,
7329                                     KM_NOSLEEP);
7330                                 if (req == NULL) {
7331                                         ibd_print_warn(state, "ibd_send: alloc "
7332                                             "ibd_req_t fail");
7333                                         /* Drop it. */
7334                                         dofree = B_TRUE;
7335                                 } else {
7336                                         req->rq_ptr = mp;
7337                                         req->rq_ptr2 = ace;
7338                                         ibd_queue_work_slot(state, req,
7339                                             IBD_ASYNC_RC_TOO_BIG);
7340                                         dofree = B_FALSE;
7341                                 }
7342                         }
7343                 } else {
7344                         ibd_print_warn(state, "Reliable Connected mode is "
7345                             "on. Multicast packet length %d > %d is too "
7346                             "long to send, drop it",
7347                             pktsize, state->id_mtu);
7348                         state->rc_xmt_drop_too_long_pkt++;
7349                         /* Drop it. */
7350                         dofree = B_TRUE;
7351                 }
7352                 rc = B_TRUE;
7353                 goto ibd_send_fail;
7354         }
7355 
7356         atomic_add_64(&state->id_xmt_bytes, pktsize);
7357         atomic_inc_64(&state->id_xmt_pkt);
7358 
7359         /*
7360          * Do LSO and checksum related work here.  For an LSO send, attach
7361          * the UD destination, the opcode and the LSO header information
7362          * to the work request.
7363          */
7364         mac_lso_get(mp, &mss, &lsoflags);
7365         if ((lsoflags & HW_LSO) != HW_LSO) {
7366                 node->w_swr.wr_opcode = IBT_WRC_SEND;
7367                 lsohdr_sz = 0;
7368         } else {
7369                 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
7370                         /*
7371                          * The routine can only fail if there's no memory; we
7372                          * can only drop the packet if this happens
7373                          */
7374                         ibd_print_warn(state,
7375                             "ibd_send: no memory, lso posting failed");
7376                         dofree = B_TRUE;
7377                         rc = B_TRUE;
7378                         goto ibd_send_fail;
7379                 }
7380 
7381                 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
7382                 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
7383         }
7384 
7385         mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
7386         if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
7387                 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
7388         else
7389                 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
7390 
7391         /*
7392          * Prepare the sgl for posting; the routine can only fail if there's
7393          * no lso buf available for posting. If this is the case, we should
7394          * probably resched for lso bufs to become available and then try again.
7395          */
7396         if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
7397                 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
7398                         dofree = B_TRUE;
7399                         rc = B_TRUE;
7400                 } else {
7401                         dofree = B_FALSE;
7402                         rc = B_FALSE;
7403                 }
7404                 goto ibd_send_fail;
7405         }
7406         node->swqe_im_mblk = mp;
7407 
7408         /*
7409          * Queue the wqe to hardware; since we can now simply queue a
7410          * post instead of doing it serially, we cannot assume anything
7411          * about the 'node' after ibd_post_send() returns.
7412          */
7413         node->swqe_next = NULL;
7414 
7415         mutex_enter(&state->id_txpost_lock);
7416         if (state->id_tx_busy) {
7417                 if (state->id_tx_head) {
7418                         state->id_tx_tail->swqe_next =
7419                             SWQE_TO_WQE(node);
7420                 } else {
7421                         state->id_tx_head = node;
7422                 }
7423                 state->id_tx_tail = node;
7424                 mutex_exit(&state->id_txpost_lock);
7425         } else {
7426                 state->id_tx_busy = 1;
7427                 mutex_exit(&state->id_txpost_lock);
7428                 ibd_post_send(state, node);
7429         }
7430 
7431         return (B_TRUE);
7432 
7433 ibd_send_fail:
7434         if (node && mp)
7435                 ibd_free_lsohdr(node, mp);
7436 
7437         if (dofree)
7438                 freemsg(mp);
7439 
7440         if (node != NULL) {
7441                 if (rc_chan) {
7442                         ibd_rc_tx_cleanup(node);
7443                 } else {
7444                         ibd_tx_cleanup(state, node);
7445                 }
7446         }
7447 
7448         return (rc);
7449 }
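
/*
 * Both the UD path in ibd_send() and the RC branch use the same "single
 * poster" convention: the first thread to find the busy flag clear posts
 * directly (and, in ibd_post_send(), keeps draining id_tx_head), while any
 * thread arriving during an ongoing post only appends to the tail under
 * id_txpost_lock.  A sketch of the producer side (illustration only):
 *
 *	mutex_enter(&state->id_txpost_lock);
 *	if (state->id_tx_busy) {
 *		append node at id_tx_tail;
 *		mutex_exit(&state->id_txpost_lock);
 *	} else {
 *		state->id_tx_busy = 1;
 *		mutex_exit(&state->id_txpost_lock);
 *		ibd_post_send(state, node);	also drains the queue
 *	}
 */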
7450 
7451 /*
7452  * GLDv3 entry point for transmitting datagram.
7453  */
7454 static mblk_t *
7455 ibd_m_tx(void *arg, mblk_t *mp)
7456 {
7457         ibd_state_t *state = (ibd_state_t *)arg;
7458         mblk_t *next;
7459 
7460         if (state->id_type == IBD_PORT_DRIVER) {
7461                 freemsgchain(mp);
7462                 return (NULL);
7463         }
7464 
7465         if ((state->id_link_state != LINK_STATE_UP) ||
7466             !(state->id_mac_state & IBD_DRV_STARTED)) {
7467                 freemsgchain(mp);
7468                 mp = NULL;
7469         }
7470 
7471         while (mp != NULL) {
7472                 next = mp->b_next;
7473                 mp->b_next = NULL;
7474                 if (ibd_send(state, mp) == B_FALSE) {
7475                         /* Send fail */
7476                         mp->b_next = next;
7477                         break;
7478                 }
7479                 mp = next;
7480         }
7481 
7482         return (mp);
7483 }
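
/*
 * A note on the return value: under the GLDv3 mc_tx contract, returning a
 * non-NULL chain tells the MAC layer those packets were not sent, and the
 * MAC layer requeues them until the driver signals that it can make
 * progress again (the Tx completion path calls ibd_resume_transmission()
 * for that).  A hedged sketch of the caller's view:
 *
 *	mblk_t *unsent = ibd_m_tx(state, chain);
 *	if (unsent != NULL)
 *		the MAC layer holds 'unsent' and retries after a Tx update;
 */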
7484 
7485 /*
7486  * This handles Tx and Rx completions. With separate CQs, it handles
7487  * only Rx completions.
7488  */
7489 static uint_t
7490 ibd_intr(caddr_t arg)
7491 {
7492         ibd_state_t *state = (ibd_state_t *)arg;
7493 
7494         ibd_poll_rcq(state, state->id_rcq_hdl);
7495 
7496         return (DDI_INTR_CLAIMED);
7497 }
7498 
7499 /*
7500  * Poll and fully drain the send cq
7501  */
7502 static void
7503 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7504 {
7505         ibt_wc_t *wcs = state->id_txwcs;
7506         uint_t numwcs = state->id_txwcs_size;
7507         ibd_wqe_t *wqe;
7508         ibd_swqe_t *head, *tail;
7509         ibt_wc_t *wc;
7510         uint_t num_polled;
7511         int i;
7512 
7513         while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7514                 head = tail = NULL;
7515                 for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7516                         wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
7517                         if (wc->wc_status != IBT_WC_SUCCESS) {
7518                                 /*
7519                                  * Channel being torn down.
7520                                  */
7521                                 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7522                                         DPRINT(5, "ibd_drain_scq: flush error");
7523                                         DPRINT(10, "ibd_drain_scq: Bad "
7524                                             "status %d", wc->wc_status);
7525                                 } else {
7526                                         DPRINT(10, "ibd_drain_scq: "
7527                                             "unexpected wc_status %d",
7528                                             wc->wc_status);
7529                                 }
7530                                 /*
7531                                  * Fallthrough to invoke the Tx handler to
7532                                  * release held resources, e.g., AH refcount.
7533                                  */
7534                         }
7535                         /*
7536                          * Add this swqe to the list to be cleaned up.
7537                          */
7538                         if (head)
7539                                 tail->swqe_next = wqe;
7540                         else
7541                                 head = WQE_TO_SWQE(wqe);
7542                         tail = WQE_TO_SWQE(wqe);
7543                 }
7544                 tail->swqe_next = NULL;
7545                 ibd_tx_cleanup_list(state, head, tail);
7546 
7547                 /*
7548                  * Resume any blocked transmissions if possible
7549                  */
7550                 ibd_resume_transmission(state);
7551         }
7552 }
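
/*
 * ibd_drain_scq() defers the per-wqe work: every polled completion carries
 * the swqe pointer it was posted with in wc_id, and the swqes are linked
 * into one head/tail list so ibd_tx_cleanup_list() can return the whole
 * batch to the free list in a single pass.  The recovery of the swqe is
 * simply:
 *
 *	wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
 */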
7553 
7554 /*
7555  * Poll and fully drain the receive cq
7556  */
7557 static void
7558 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7559 {
7560         ibt_wc_t *wcs = state->id_rxwcs;
7561         uint_t numwcs = state->id_rxwcs_size;
7562         ibd_rwqe_t *rwqe;
7563         ibt_wc_t *wc;
7564         uint_t num_polled;
7565         int i;
7566         mblk_t *head, *tail, *mp;
7567 
7568         while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7569                 head = tail = NULL;
7570                 for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7571                         rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
7572                         if (wc->wc_status != IBT_WC_SUCCESS) {
7573                                 /*
7574                                  * Channel being torn down.
7575                                  */
7576                                 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7577                                         DPRINT(5, "ibd_drain_rcq: "
7578                                             "expected flushed rwqe");
7579                                 } else {
7580                                         DPRINT(5, "ibd_drain_rcq: "
7581                                             "unexpected wc_status %d",
7582                                             wc->wc_status);
7583                                 }
7584                                 atomic_inc_32(
7585                                     &state->id_rx_list.dl_bufs_outstanding);
7586                                 freemsg(rwqe->rwqe_im_mblk);
7587                                 continue;
7588                         }
7589                         mp = ibd_process_rx(state, rwqe, wc);
7590                         if (mp == NULL)
7591                                 continue;
7592 
7593                         /*
7594                          * Add this mp to the list to send to the nw layer.
7595                          */
7596                         if (head)
7597                                 tail->b_next = mp;
7598                         else
7599                                 head = mp;
7600                         tail = mp;
7601                 }
7602                 if (head)
7603                         mac_rx(state->id_mh, state->id_rh, head);
7604 
7605                 /*
7606                  * Account for the number of rwqes polled.
7607                  * Post more here if fewer than one fourth remain posted.
7608                  */
7609                 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
7610                     (state->id_ud_num_rwqe / 4))
7611                         ibd_post_recv_intr(state);
7612         }
7613 }
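
/*
 * Two forms of batching are at work in ibd_drain_rcq(): completions are
 * pulled from the CQ up to id_rxwcs_size at a time, and the resulting
 * mblks are chained through b_next so one mac_rx() call hands the whole
 * batch to the network layer.  The refill check then reposts receive
 * buffers once fewer than a quarter of id_ud_num_rwqe remain posted.
 * Chain building, as a sketch:
 *
 *	if (head == NULL)
 *		head = mp;
 *	else
 *		tail->b_next = mp;
 *	tail = mp;
 *	...
 *	mac_rx(state->id_mh, state->id_rh, head);
 */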
7614 
7615 /*
7616  * Common code for interrupt handling as well as for polling
7617  * for all completed wqe's while detaching.
7618  */
7619 static void
7620 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7621 {
7622         int flag, redo_flag;
7623         int redo = 1;
7624 
7625         flag = IBD_CQ_POLLING;
7626         redo_flag = IBD_REDO_CQ_POLLING;
7627 
7628         mutex_enter(&state->id_scq_poll_lock);
7629         if (state->id_scq_poll_busy & flag) {
7630                 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
7631                 state->id_scq_poll_busy |= redo_flag;
7632                 mutex_exit(&state->id_scq_poll_lock);
7633                 return;
7634         }
7635         state->id_scq_poll_busy |= flag;
7636         mutex_exit(&state->id_scq_poll_lock);
7637 
7638         /*
7639          * In some cases (eg detaching), this code can be invoked on
7640          * any cpu after disabling cq notification (thus no concurrency
7641          * exists). Apart from that, the following applies normally:
7642          * Transmit completion handling could be from any cpu if
7643          * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
7644          * is interrupt driven.
7645          */
7646 
7647         /*
7648          * Poll and drain the CQ
7649          */
7650         ibd_drain_scq(state, cq_hdl);
7651 
7652         /*
7653          * Enable CQ notifications and redrain the cq to catch any
7654          * completions we might have missed after the ibd_drain_scq()
7655          * above and before the ibt_enable_cq_notify() that follows.
7656          * Finally, service any new requests to poll the cq that
7657          * could've come in after the ibt_enable_cq_notify().
7658          */
7659         do {
7660                 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
7661                     IBT_SUCCESS) {
7662                         DPRINT(10, "ibd_poll_scq: ibt_enable_cq_notify() failed");
7663                 }
7664 
7665                 ibd_drain_scq(state, cq_hdl);
7666 
7667                 mutex_enter(&state->id_scq_poll_lock);
7668                 if (state->id_scq_poll_busy & redo_flag)
7669                         state->id_scq_poll_busy &= ~redo_flag;
7670                 else {
7671                         state->id_scq_poll_busy &= ~flag;
7672                         redo = 0;
7673                 }
7674                 mutex_exit(&state->id_scq_poll_lock);
7675 
7676         } while (redo);
7677 }
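
/*
 * The do/while loop above closes the classic missed-event race that comes
 * with edge-style CQ notification: a completion arriving after
 * ibd_drain_scq() returns but before ibt_enable_cq_notify() takes effect
 * would otherwise never raise an interrupt.  Draining once more after
 * arming the CQ catches it, and the IBD_REDO_CQ_POLLING flag lets a
 * concurrent poll request piggyback on the thread already polling.  The
 * generic pattern, as a sketch:
 *
 *	do {
 *		(void) ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);
 *		drain(cq);
 *	} while (another poll request arrived while we were draining);
 */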
7678 
7679 /*
7680  * Common code for interrupt handling as well as for polling
7681  * for all completed wqe's while detaching.
7682  */
7683 static void
7684 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
7685 {
7686         int flag, redo_flag;
7687         int redo = 1;
7688 
7689         flag = IBD_CQ_POLLING;
7690         redo_flag = IBD_REDO_CQ_POLLING;
7691 
7692         mutex_enter(&state->id_rcq_poll_lock);
7693         if (state->id_rcq_poll_busy & flag) {
7694                 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
7695                 state->id_rcq_poll_busy |= redo_flag;
7696                 mutex_exit(&state->id_rcq_poll_lock);
7697                 return;
7698         }
7699         state->id_rcq_poll_busy |= flag;
7700         mutex_exit(&state->id_rcq_poll_lock);
7701 
7702         /*
7703          * Poll and drain the CQ
7704          */
7705         ibd_drain_rcq(state, rcq);
7706 
7707         /*
7708          * Enable CQ notifications and redrain the cq to catch any
7709          * completions we might have missed after the ibd_drain_rcq()
7710          * above and before the ibt_enable_cq_notify() that follows.
7711          * Finally, service any new requests to poll the cq that
7712          * could've come in after the ibt_enable_cq_notify().
7713          */
7714         do {
7715                 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
7716                     IBT_SUCCESS) {
7717                         DPRINT(10, "ibd_poll_rcq: ibt_enable_cq_notify() failed");
7718                 }
7719 
7720                 ibd_drain_rcq(state, rcq);
7721 
7722                 mutex_enter(&state->id_rcq_poll_lock);
7723                 if (state->id_rcq_poll_busy & redo_flag)
7724                         state->id_rcq_poll_busy &= ~redo_flag;
7725                 else {
7726                         state->id_rcq_poll_busy &= ~flag;
7727                         redo = 0;
7728                 }
7729                 mutex_exit(&state->id_rcq_poll_lock);
7730 
7731         } while (redo);
7732 }
7733 
7734 /*
7735  * Unmap the memory area associated with a given swqe.
7736  */
7737 void
7738 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
7739 {
7740         ibt_status_t stat;
7741 
7742         DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
7743 
7744         if (swqe->w_mi_hdl) {
7745                 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
7746                     swqe->w_mi_hdl)) != IBT_SUCCESS) {
7747                         DPRINT(10,
7748                             "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
7749                 }
7750                 swqe->w_mi_hdl = NULL;
7751         }
7752         swqe->w_swr.wr_nds = 0;
7753 }
7754 
7755 void
7756 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
7757 {
7758         /*
7759          * The recycling logic can be eliminated from here
7760          * and put into the async thread if we create another
7761          * list to hold ACE's for unjoined mcg's.
7762          */
7763         if (DEC_REF_DO_CYCLE(ace)) {
7764                 ibd_mce_t *mce;
7765 
7766                 /*
7767                  * Check with the lock taken: we decremented
7768                  * reference count without the lock, and some
7769                  * transmitter might already have bumped the
7770                  * reference count (possible in case of multicast
7771                  * disable when we leave the AH on the active
7772                  * list). If not still 0, get out, leaving the
7773                  * recycle bit intact.
7774                  *
7775                  * Atomically transition the AH from active
7776                  * to free list, and queue a work request to
7777                  * leave the group and destroy the mce. No
7778                  * transmitter can be looking at the AH or
7779                  * the MCE in between, since we have the
7780                  * ac_mutex lock. In the SendOnly reap case,
7781                  * it is not necessary to hold the ac_mutex
7782                  * and recheck the ref count (since the AH was
7783                  * taken off the active list), we just do it
7784                  * to have uniform processing with the Full
7785                  * reap case.
7786                  */
7787                 mutex_enter(&state->id_ac_mutex);
7788                 mce = ace->ac_mce;
7789                 if (GET_REF_CYCLE(ace) == 0) {
7790                         CLEAR_REFCYCLE(ace);
7791                         /*
7792                          * Identify the case of fullmember reap as
7793                          * opposed to mcg trap reap. Also, port up
7794                          * might set ac_mce to NULL to indicate Tx
7795                          * cleanup should do no more than put the
7796                          * AH in the free list (see ibd_async_link).
7797                          */
7798                         if (mce != NULL) {
7799                                 ace->ac_mce = NULL;
7800                                 IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
7801                                 /*
7802                                  * mc_req was initialized at mce
7803                                  * creation time.
7804                                  */
7805                                 ibd_queue_work_slot(state,
7806                                     &mce->mc_req, IBD_ASYNC_REAP);
7807                         }
7808                         IBD_ACACHE_INSERT_FREE(state, ace);
7809                 }
7810                 mutex_exit(&state->id_ac_mutex);
7811         }
7812 }
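
/*
 * The shape of ibd_dec_ref_ace() is a lock-free decrement followed by a
 * locked re-check: the reference count is dropped without id_ac_mutex, and
 * only when that drop indicates the AH should be recycled do we take the
 * lock and look at the count again, because another sender may have bumped
 * it in the meantime.  Schematically (DEC_REF_DO_CYCLE and GET_REF_CYCLE
 * are driver-internal macros; this only shows the intent):
 *
 *	if (decrement says count reached zero with recycle requested) {
 *		mutex_enter(&state->id_ac_mutex);
 *		if (count is still zero)
 *			move the ace from the active to the free list;
 *		mutex_exit(&state->id_ac_mutex);
 *	}
 */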
7813 
7814 /*
7815  * Common code that deals with clean ups after a successful or
7816  * erroneous transmission attempt.
7817  */
7818 static void
7819 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
7820 {
7821         ibd_ace_t *ace = swqe->w_ahandle;
7822 
7823         DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
7824 
7825         /*
7826          * If this was a dynamic mapping in ibd_send(), we need to
7827          * unmap here. If this was an lso buffer we'd used for sending,
7828          * we need to release the lso buf to the pool, since the resource
7829          * is scarce. However, if this was simply a normal send using
7830          * the copybuf (present in each swqe), we don't need to release it.
7831          */
7832         if (swqe->swqe_im_mblk != NULL) {
7833                 if (swqe->w_buftype == IBD_WQE_MAPPED) {
7834                         ibd_unmap_mem(state, swqe);
7835                 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7836                         ibd_release_lsobufs(state,
7837                             swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7838                 }
7839                 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7840                 freemsg(swqe->swqe_im_mblk);
7841                 swqe->swqe_im_mblk = NULL;
7842         }
7843 
7844         /*
7845          * Drop the reference count on the AH; it can be reused
7846          * now for a different destination if there are no more
7847          * posted sends that will use it. This can be eliminated
7848          * if we can always associate each Tx buffer with an AH.
7849          * The ace can be null if we are cleaning up from the
7850          * ibd_send() error path.
7851          */
7852         if (ace != NULL) {
7853                 ibd_dec_ref_ace(state, ace);
7854         }
7855 
7856         /*
7857          * Release the send wqe for reuse.
7858          */
7859         swqe->swqe_next = NULL;
7860         ibd_release_swqe(state, swqe, swqe, 1);
7861 }
7862 
7863 static void
7864 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
7865 {
7866         ibd_ace_t *ace;
7867         ibd_swqe_t *swqe;
7868         int n = 0;
7869 
7870         DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
7871 
7872         for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
7873 
7874                 /*
7875                  * If this was a dynamic mapping in ibd_send(), we need to
7876                  * unmap here. If this was an lso buffer we'd used for sending,
7877                  * we need to release the lso buf to the pool, since the
7878                  * resource is scarce. However, if this was simply a normal
7879                  * send using the copybuf (present in each swqe), we don't need
7880                  * to release it.
7881                  */
7882                 if (swqe->swqe_im_mblk != NULL) {
7883                         if (swqe->w_buftype == IBD_WQE_MAPPED) {
7884                                 ibd_unmap_mem(state, swqe);
7885                         } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7886                                 ibd_release_lsobufs(state,
7887                                     swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7888                         }
7889                         ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7890                         freemsg(swqe->swqe_im_mblk);
7891                         swqe->swqe_im_mblk = NULL;
7892                 }
7893 
7894                 /*
7895                  * Drop the reference count on the AH; it can be reused
7896                  * now for a different destination if there are no more
7897                  * posted sends that will use it. This can be eliminated
7898                  * if we can always associate each Tx buffer with an AH.
7899                  * The ace can be null if we are cleaning up from the
7900                  * ibd_send() error path.
7901                  */
7902                 ace = swqe->w_ahandle;
7903                 if (ace != NULL) {
7904                         ibd_dec_ref_ace(state, ace);
7905                 }
7906                 n++;
7907         }
7908 
7909         /*
7910          * Release the send wqes for reuse.
7911          */
7912         ibd_release_swqe(state, head, tail, n);
7913 }
7914 
7915 /*
7916  * Processing to be done after receipt of a packet; hand it off to GLD
7917  * in the format GLD expects.  The received packet has this
7918  * format: 2b sap :: 00 :: data.
7919  */
7920 static mblk_t *
7921 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
7922 {
7923         ib_header_info_t *phdr;
7924         mblk_t *mp;
7925         ipoib_hdr_t *ipibp;
7926         ipha_t *iphap;
7927         ip6_t *ip6h;
7928         int len;
7929         ib_msglen_t pkt_len = wc->wc_bytes_xfer;
7930         uint32_t bufs;
7931 
7932         /*
7933          * Track the number of buffers handed up that need to be returned.
7934          */
7935         bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
7936 
7937         /* Never run out of rwqes, use allocb when running low */
7938         if (bufs >= state->id_rx_bufs_outstanding_limit) {
7939                 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7940                 atomic_inc_32(&state->id_rx_allocb);
7941                 mp = allocb(pkt_len, BPRI_HI);
7942                 if (mp) {
7943                         bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
7944                         ibd_post_recv(state, rwqe);
7945                 } else {        /* no memory */
7946                         atomic_inc_32(&state->id_rx_allocb_failed);
7947                         ibd_post_recv(state, rwqe);
7948                         return (NULL);
7949                 }
7950         } else {
7951                 mp = rwqe->rwqe_im_mblk;
7952         }
7953 
7955         /*
7956          * Adjust write pointer depending on how much data came in.
7957          */
7958         mp->b_wptr = mp->b_rptr + pkt_len;
7959 
7960         /*
7961          * Make sure this is NULL or we're in trouble.
7962          */
7963         if (mp->b_next != NULL) {
7964                 ibd_print_warn(state,
7965                     "ibd_process_rx: got duplicate mp from rcq?");
7966                 mp->b_next = NULL;
7967         }
7968 
7969         /*
7970          * The IB link delivers one of the IB link-layer headers, the
7971          * Global Routing Header (GRH), with each datagram.  The ibd
7972          * driver uses the information in the GRH to build the
7973          * header_info structure and passes it up to GLDv3 along with
7974          * the datagram.
7975          * If the GRH is not valid, indicate this to GLDv3 by setting
7976          * the VerTcFlow field to 0.
7977          */
7978         phdr = (ib_header_info_t *)mp->b_rptr;
7979         if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
7980                 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
7981 
7982                 /* if it is loop back packet, just drop it. */
7983                 if (state->id_enable_rc) {
7984                         if (bcmp(&phdr->ib_grh.ipoib_sqpn,
7985                             &state->rc_macaddr_loopback,
7986                             IPOIB_ADDRL) == 0) {
7987                                 freemsg(mp);
7988                                 return (NULL);
7989                         }
7990                 } else {
7991                         if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
7992                             IPOIB_ADDRL) == 0) {
7993                                 freemsg(mp);
7994                                 return (NULL);
7995                         }
7996                 }
7997 
7998                 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
7999                     sizeof (ipoib_mac_t));
8000                 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
8001                         phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
8002                         IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
8003                 } else {
8004                         phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
8005                 }
8006         } else {
8007                 /*
8008                  * It cannot be an IBA multicast packet; it must have been
8009                  * unicast to us. Just copy the interface address to dst.
8010                  */
8011                 phdr->ib_grh.ipoib_vertcflow = 0;
8012                 ovbcopy(&state->id_macaddr, &phdr->ib_dst,
8013                     sizeof (ipoib_mac_t));
8014         }
8015 
8016         /*
8017          * For ND6 packets, padding is at the front of the source/target
8018          * lladdr. However, the inet6 layer is not aware of it, so remove
8019          * the padding from such packets.
8020          */
8021         ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
8022         if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
8023                 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8024                 len = ntohs(ip6h->ip6_plen);
8025                 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
8026                         /* LINTED: E_CONSTANT_CONDITION */
8027                         IBD_PAD_NSNA(ip6h, len, IBD_RECV);
8028                 }
8029         }
8030 
8031         /*
8032          * Update statistics
8033          */
8034         atomic_add_64(&state->id_rcv_bytes, pkt_len);
8035         atomic_inc_64(&state->id_rcv_pkt);
8036         if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
8037                 atomic_inc_64(&state->id_brd_rcv);
8038         else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
8039                 atomic_inc_64(&state->id_multi_rcv);
8040 
8041         iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8042         /*
8043          * Set the receive checksum status in mp.
8044          * Hardware checksumming can be considered valid only if:
8045          * 1. the CQE.IP_OK bit is set,
8046          * 2. CQE.CKSUM == 0xffff,
8047          * 3. no IPv6 routing header is present in the packet, and
8048          * 4. there are no IP options in the IP header.
8049          */
8050 
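        /*
         * When the conditions above hold, mark the mblk with
         * HCK_FULLCKSUM_OK so that the MAC and IP layers treat the
         * checksum as already verified by hardware and skip the
         * software verification for this packet.
         */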
8051         if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
8052             (wc->wc_cksum == 0xFFFF) &&
8053             (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
8054                 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
8055         }
8056 
8057         return (mp);
8058 }
8059 
8060 /*
8061  * Callback code invoked from STREAMS when the receive data buffer is
8062  * free for recycling.
8063  */
8064 static void
8065 ibd_freemsg_cb(char *arg)
8066 {
8067         ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
8068         ibd_state_t *state = rwqe->w_state;
8069 
8070         atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
8071 
8072         /*
8073          * If the driver is stopped, just free the rwqe.
8074          */
8075         if (atomic_add_32_nv(&state->id_running, 0) == 0) {
8076                 DPRINT(6, "ibd_freemsg: wqe being freed");
8077                 rwqe->rwqe_im_mblk = NULL;
8078                 ibd_free_rwqe(state, rwqe);
8079                 return;
8080         }
8081 
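        /*
         * Re-arm the rwqe: wrap the same copy buffer in a fresh mblk via
         * desballoc() (registering this callback again) and post it back
         * on the receive queue so it can be loaned upstream again.
         */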
8082         rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
8083             state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
8084         if (rwqe->rwqe_im_mblk == NULL) {
8085                 ibd_free_rwqe(state, rwqe);
8086                 DPRINT(6, "ibd_freemsg: desballoc failed");
8087                 return;
8088         }
8089 
8090         ibd_post_recv(state, rwqe);
8091 }
8092 
8093 static uint_t
8094 ibd_tx_recycle(caddr_t arg)
8095 {
8096         ibd_state_t *state = (ibd_state_t *)arg;
8097 
8098         /*
8099          * Poll for completed entries
8100          */
8101         ibd_poll_scq(state, state->id_scq_hdl);
8102 
8103         return (DDI_INTR_CLAIMED);
8104 }
8105 
8106 #ifdef IBD_LOGGING
8107 static void
8108 ibd_log_init(void)
8109 {
8110         ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
8111         ibd_lbuf_ndx = 0;
8112 
8113         mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
8114 }
8115 
8116 static void
8117 ibd_log_fini(void)
8118 {
8119         if (ibd_lbuf)
8120                 kmem_free(ibd_lbuf, IBD_LOG_SZ);
8121         ibd_lbuf_ndx = 0;
8122         ibd_lbuf = NULL;
8123 
8124         mutex_destroy(&ibd_lbuf_lock);
8125 }
8126 
8127 static void
8128 ibd_log(const char *fmt, ...)
8129 {
8130         va_list ap;
8131         uint32_t off;
8132         uint32_t msglen;
8133         char tmpbuf[IBD_DMAX_LINE];
8134 
8135         if (ibd_lbuf == NULL)
8136                 return;
8137 
8138         va_start(ap, fmt);
8139         msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
8140         va_end(ap);
8141 
8142         if (msglen >= IBD_DMAX_LINE)
8143                 msglen = IBD_DMAX_LINE - 1;
8144 
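        /*
         * The log index is advanced while holding the lock; the actual
         * bcopy() into the reserved region happens after the lock is
         * dropped, which is safe because that region is not handed out
         * again until the index wraps all the way around the buffer.
         */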
8145         mutex_enter(&ibd_lbuf_lock);
8146 
8147         off = ibd_lbuf_ndx;             /* current msg should go here */
8148         if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
8149                 ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
8150 
8151         ibd_lbuf_ndx += msglen;         /* place where next msg should start */
8152         ibd_lbuf[ibd_lbuf_ndx] = 0;     /* current msg should terminate */
8153 
8154         if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
8155                 ibd_lbuf_ndx = 0;
8156 
8157         mutex_exit(&ibd_lbuf_lock);
8158 
8159         bcopy(tmpbuf, ibd_lbuf+off, msglen);    /* no lock needed for this */
8160 }
8161 #endif
8162 
8163 /* ARGSUSED */
8164 static int
8165 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8166     int *rvalp)
8167 {
8168         ibd_create_ioctl_t      *cmd = karg;
8169         ibd_state_t             *state, *port_state, *p;
8170         int                     i, err, rval = 0;
8171         mac_register_t          *macp;
8172         ibt_hca_portinfo_t      *pinfop = NULL;
8173         ibt_status_t            ibt_status;
8174         uint_t                  psize, pinfosz;
8175         boolean_t               force_create = B_FALSE;
8176 
8177         cmd->ibdioc.ioc_status = 0;
8178 
8179         if (cmd->ibdioc.ioc_port_inst < 0) {
8180                 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8181                 return (EINVAL);
8182         }
8183         port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
8184         if (port_state == NULL) {
8185                 DPRINT(10, "ibd_create_partition: failed to get state %d",
8186                     cmd->ibdioc.ioc_port_inst);
8187                 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8188                 return (EINVAL);
8189         }
8190 
8191         /* Limited PKeys not supported */
8192         if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
8193                 rval = EINVAL;
8194                 goto part_create_return;
8195         }
8196 
8197         if (cmd->ioc_force_create == 0) {
8198                 /*
8199                  * Check if the port pkey table contains the pkey for which
8200                  * this partition is being created.
8201                  */
8202                 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8203                     port_state->id_port, &pinfop, &psize, &pinfosz);
8204 
8205                 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8206                         rval = EINVAL;
8207                         goto part_create_return;
8208                 }
8209 
8210                 if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
8211                         rval = ENETDOWN;
8212                         cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
8213                         goto part_create_return;
8214                 }
8215 
8216                 for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
8217                         if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
8218                                 break;
8219                         }
8220                 }
8221                 if (i == pinfop->p_pkey_tbl_sz) {
8222                         rval = EINVAL;
8223                         cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
8224                         goto part_create_return;
8225                 }
8226         } else {
8227                 force_create = B_TRUE;
8228         }
8229 
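        /*
         * Make sure a partition with the same pkey and partition link id
         * does not already exist on this port instance.
         */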
8230         mutex_enter(&ibd_objlist_lock);
8231         for (p = ibd_objlist_head; p; p = p->id_next) {
8232                 if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
8233                     (p->id_pkey == cmd->ioc_pkey) &&
8234                     (p->id_plinkid == cmd->ioc_partid)) {
8235                         mutex_exit(&ibd_objlist_lock);
8236                         rval = EEXIST;
8237                         cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
8238                         goto part_create_return;
8239                 }
8240         }
8241         mutex_exit(&ibd_objlist_lock);
8242 
8243         state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);
8244 
8245         state->id_type               = IBD_PARTITION_OBJ;
8246 
8247         state->id_plinkid    = cmd->ioc_partid;
8248         state->id_dlinkid    = cmd->ibdioc.ioc_linkid;
8249         state->id_port_inst  = cmd->ibdioc.ioc_port_inst;
8250 
8251         state->id_dip                = port_state->id_dip;
8252         state->id_port               = port_state->id_port;
8253         state->id_pkey               = cmd->ioc_pkey;
8254         state->id_hca_guid   = port_state->id_hca_guid;
8255         state->id_port_guid  = port_state->id_port_guid;
8256         state->id_force_create       = force_create;
8257 
8258         mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
8259         cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
8260 
8261         if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
8262                 rval = EIO;
8263                 cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
8264                 goto fail;
8265         }
8266 
8267         if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
8268                 rval = EAGAIN;
8269                 goto fail;
8270         }
8271 
8272         macp->m_type_ident   = MAC_PLUGIN_IDENT_IB;
8273         macp->m_dip          = port_state->id_dip;
8274         macp->m_instance     = (uint_t)-1;
8275         macp->m_driver               = state;
8276         macp->m_src_addr     = (uint8_t *)&state->id_macaddr;
8277         macp->m_callbacks    = &ibd_m_callbacks;
8278         macp->m_min_sdu              = 0;
8279         macp->m_multicast_sdu        = IBD_DEF_MAX_SDU;
8280         if (state->id_enable_rc) {
8281                 macp->m_max_sdu              = IBD_DEF_RC_MAX_SDU;
8282         } else {
8283                 macp->m_max_sdu              = IBD_DEF_MAX_SDU;
8284         }
8285         macp->m_priv_props = ibd_priv_props;
8286 
8287         err = mac_register(macp, &state->id_mh);
8288         mac_free(macp);
8289 
8290         if (err != 0) {
8291                 DPRINT(10, "ibd_create_partition: mac_register() failed %d",
8292                     err);
8293                 rval = err;
8294                 goto fail;
8295         }
8296 
8297         err = dls_devnet_create(state->id_mh,
8298             cmd->ioc_partid, crgetzoneid(credp));
8299         if (err != 0) {
8300                 DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
8301                     "%d", err);
8302                 rval = err;
8303                 (void) mac_unregister(state->id_mh);
8304                 goto fail;
8305         }
8306 
8307         /*
8308          * Add the new partition state structure to the list
8309          */
8310         mutex_enter(&ibd_objlist_lock);
8311         if (ibd_objlist_head)
8312                 state->id_next = ibd_objlist_head;
8313 
8314         ibd_objlist_head = state;
8315         mutex_exit(&ibd_objlist_lock);
8316 
8317 part_create_return:
8318         if (pinfop) {
8319                 ibt_free_portinfo(pinfop, pinfosz);
8320         }
8321         return (rval);
8322 
8323 fail:
8324         if (pinfop) {
8325                 ibt_free_portinfo(pinfop, pinfosz);
8326         }
8327         ibd_part_unattach(state);
8328         kmem_free(state, sizeof (ibd_state_t));
8329         return (rval);
8330 }
8331 
8332 /* ARGSUSED */
8333 static int
8334 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8335     int *rvalp)
8336 {
8337         int err;
8338         datalink_id_t tmpid;
8339         ibd_state_t *node, *prev;
8340         ibd_delete_ioctl_t *cmd = karg;
8341 
8342         prev = NULL;
8343 
8344         mutex_enter(&ibd_objlist_lock);
8345         node = ibd_objlist_head;
8346 
8347         /* Find the ibd state structure corresponding to the partition */
8348         while (node != NULL) {
8349                 if (node->id_plinkid == cmd->ioc_partid)
8350                         break;
8351                 prev = node;
8352                 node = node->id_next;
8353         }
8354 
8355         if (node == NULL) {
8356                 mutex_exit(&ibd_objlist_lock);
8357                 return (ENOENT);
8358         }
8359 
8360         if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
8361                 DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
8362                     "%d", err);
8363                 mutex_exit(&ibd_objlist_lock);
8364                 return (err);
8365         }
8366 
8367         /*
8368          * Call ibd_part_unattach() only after making sure that the
8369          * instance has not been started and is not in late HCA init mode.
8370          */
8371         ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8372 
8373         err = 0;
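        /*
         * If the partition is still started, in late HCA init, busy, or
         * cannot be mac-disabled, undo the dls_devnet_destroy() done
         * above by re-creating the devnet entry, and fail the delete.
         */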
8374         if ((node->id_mac_state & IBD_DRV_STARTED) ||
8375             (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
8376             (ibd_part_busy(node) != DDI_SUCCESS) ||
8377             ((err = mac_disable(node->id_mh)) != 0)) {
8378                 (void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
8379                     crgetzoneid(credp));
8380                 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8381                 mutex_exit(&ibd_objlist_lock);
8382                 return (err != 0 ? err : EBUSY);
8383         }
8384 
8385         node->id_mac_state |= IBD_DRV_IN_DELETION;
8386 
8387         ibd_part_unattach(node);
8388 
8389         ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8390 
8391         /* Remove the partition state structure from the linked list */
8392         if (prev == NULL)
8393                 ibd_objlist_head = node->id_next;
8394         else
8395                 prev->id_next = node->id_next;
8396         mutex_exit(&ibd_objlist_lock);
8397 
8398         if ((err = mac_unregister(node->id_mh)) != 0) {
8399                 DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
8400                     err);
8401         }
8402 
8403         cv_destroy(&node->id_macst_cv);
8404         mutex_destroy(&node->id_macst_lock);
8405 
8406         kmem_free(node, sizeof (ibd_state_t));
8407 
8408         return (0);
8409 }
8410 
8411 /* ARGSUSED */
8412 static int
8413 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
8414     int *rvalp)
8415 {
8416         ibd_ioctl_t             cmd;
8417         ibpart_ioctl_t          partioc;
8418         ibport_ioctl_t          portioc;
8419 #ifdef _MULTI_DATAMODEL
8420         ibport_ioctl32_t        portioc32;
8421 #endif
8422         ibd_state_t             *state, *port_state;
8423         int                     size;
8424         ibt_hca_portinfo_t      *pinfop = NULL;
8425         ibt_status_t            ibt_status;
8426         uint_t                  psize, pinfosz;
8427         int                     rval = 0;
8428 
8429         size = sizeof (ibd_ioctl_t);
8430         if (ddi_copyin((void *)arg, &cmd, size, mode)) {
8431                 return (EFAULT);
8432         }
8433         cmd.ioc_status = 0;
8434         switch (cmd.ioc_info_cmd) {
8435         case IBD_INFO_CMD_IBPART:
8436                 size = sizeof (ibpart_ioctl_t);
8437                 if (ddi_copyin((void *)arg, &partioc, size, mode)) {
8438                         return (EFAULT);
8439                 }
8440 
8441                 mutex_enter(&ibd_objlist_lock);
8442                 /* Find the ibd state structure corresponding to the partition */
8443                 for (state = ibd_objlist_head; state; state = state->id_next) {
8444                         if (state->id_plinkid == cmd.ioc_linkid) {
8445                                 break;
8446                         }
8447                 }
8448 
8449                 if (state == NULL) {
8450                         mutex_exit(&ibd_objlist_lock);
8451                         return (ENOENT);
8452                 }
8453 
8454                 partioc.ibdioc.ioc_linkid = state->id_dlinkid;
8455                 partioc.ibdioc.ioc_port_inst = state->id_port_inst;
8456                 partioc.ibdioc.ioc_portnum = state->id_port;
8457                 partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
8458                 partioc.ibdioc.ioc_portguid = state->id_port_guid;
8459                 partioc.ibdioc.ioc_status = 0;
8460                 partioc.ioc_partid = state->id_plinkid;
8461                 partioc.ioc_pkey = state->id_pkey;
8462                 partioc.ioc_force_create = state->id_force_create;
8463                 if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
8464                         mutex_exit(&ibd_objlist_lock);
8465                         return (EFAULT);
8466                 }
8467                 mutex_exit(&ibd_objlist_lock);
8468 
8469                 break;
8470 
8471         case IBD_INFO_CMD_IBPORT:
8472                 if ((cmd.ioc_port_inst < 0) || ((port_state =
8473                     ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8474                         DPRINT(10, "ibd_get_partition_info: failed to get"
8475                             " state %d", cmd.ioc_port_inst);
8476                         size = sizeof (ibd_ioctl_t);
8477                         cmd.ioc_status = IBD_INVALID_PORT_INST;
8478                         if (ddi_copyout((void *)&cmd, (void *)arg, size,
8479                             mode)) {
8480                                 return (EFAULT);
8481                         }
8482                         return (EINVAL);
8483                 }
8484                 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8485                     port_state->id_port, &pinfop, &psize, &pinfosz);
8486                 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8487                         return (EINVAL);
8488                 }
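                /*
                 * Copy the port/pkey information out according to the
                 * caller's data model: a 32-bit consumer of this ioctl
                 * uses ibport_ioctl32_t, while a native consumer uses
                 * ibport_ioctl_t.
                 */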
8489 #ifdef _MULTI_DATAMODEL
8490                 switch (ddi_model_convert_from(mode & FMODELS)) {
8491                 case DDI_MODEL_ILP32: {
8492                         size = sizeof (ibport_ioctl32_t);
8493                         if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8494                                 rval = EFAULT;
8495                                 goto fail;
8496                         }
8497                         portioc32.ibdioc.ioc_status = 0;
8498                         portioc32.ibdioc.ioc_portnum = port_state->id_port;
8499                         portioc32.ibdioc.ioc_hcaguid =
8500                             port_state->id_hca_guid;
8501                         portioc32.ibdioc.ioc_portguid =
8502                             port_state->id_port_guid;
8503                         if (portioc32.ioc_pkey_tbl_sz !=
8504                             pinfop->p_pkey_tbl_sz) {
8505                                 rval = EINVAL;
8506                                 size = sizeof (ibd_ioctl_t);
8507                                 portioc32.ibdioc.ioc_status =
8508                                     IBD_INVALID_PKEY_TBL_SIZE;
8509                                 if (ddi_copyout((void *)&portioc32.ibdioc,
8510                                     (void *)arg, size, mode)) {
8511                                         rval = EFAULT;
8512                                         goto fail;
8513                                 }
8514                                 goto fail;
8515                         }
8516                         size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8517                         if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8518                             (void *)(uintptr_t)portioc32.ioc_pkeys, size,
8519                             mode)) {
8520                                 rval = EFAULT;
8521                                 goto fail;
8522                         }
8523                         size = sizeof (ibport_ioctl32_t);
8524                         if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8525                             mode)) {
8526                                 rval = EFAULT;
8527                                 goto fail;
8528                         }
8529                         break;
8530                 }
8531                 case DDI_MODEL_NONE:
8532                         size = sizeof (ibport_ioctl_t);
8533                         if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8534                                 rval = EFAULT;
8535                                 goto fail;
8536                         }
8537                         portioc.ibdioc.ioc_status = 0;
8538                         portioc.ibdioc.ioc_portnum = port_state->id_port;
8539                         portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8540                         portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8541                         if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8542                                 rval = EINVAL;
8543                                 size = sizeof (ibd_ioctl_t);
8544                                 portioc.ibdioc.ioc_status =
8545                                     IBD_INVALID_PKEY_TBL_SIZE;
8546                                 if (ddi_copyout((void *)&portioc.ibdioc,
8547                                     (void *)arg, size, mode)) {
8548                                         rval = EFAULT;
8549                                         goto fail;
8550                                 }
8551                                 goto fail;
8552                         }
8553                         size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8554                         if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8555                             (void *)(portioc.ioc_pkeys), size, mode)) {
8556                                 rval = EFAULT;
8557                                 goto fail;
8558                         }
8559                         size = sizeof (ibport_ioctl_t);
8560                         if (ddi_copyout((void *)&portioc, (void *)arg, size,
8561                             mode)) {
8562                                 rval = EFAULT;
8563                                 goto fail;
8564                         }
8565                         break;
8566                 }
8567 #else /* ! _MULTI_DATAMODEL */
8568                 size = sizeof (ibport_ioctl_t);
8569                 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8570                         rval = EFAULT;
8571                         goto fail;
8572                 }
8573                 portioc.ibdioc.ioc_status = 0;
8574                 portioc.ibdioc.ioc_portnum = port_state->id_port;
8575                 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8576                 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8577                 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8578                         rval = EINVAL;
8579                         size = sizeof (ibd_ioctl_t);
8580                         portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
8581                         if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
8582                             size, mode)) {
8583                                 rval = EFAULT;
8584                                 goto fail;
8585                         }
8586                         goto fail;
8587                 }
8588                 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8589                 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8590                     (void *)(portioc.ioc_pkeys), size, mode)) {
8591                         rval = EFAULT;
8592                         goto fail;
8593                 }
8594                 size = sizeof (ibport_ioctl_t);
8595                 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8596                     mode)) {
8597                         rval = EFAULT;
8598                         goto fail;
8599                 }
8600 #endif /* _MULTI_DATAMODEL */
8601 
8602                 break;
8603 
8604         case IBD_INFO_CMD_PKEYTBLSZ:
8605                 if ((cmd.ioc_port_inst < 0) || ((port_state =
8606                     ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8607                         DPRINT(10, "ibd_get_partition_info: failed to get"
8608                             " state %d", cmd.ioc_port_inst);
8609                         size = sizeof (ibd_ioctl_t);
8610                         cmd.ioc_status = IBD_INVALID_PORT_INST;
8611                         if (ddi_copyout((void *)&cmd, (void *)arg, size,
8612                             mode)) {
8613                                 return (EFAULT);
8614                         }
8615                         return (EINVAL);
8616                 }
8617                 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8618                     port_state->id_port, &pinfop, &psize, &pinfosz);
8619                 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8620                         return (EINVAL);
8621                 }
8622 #ifdef _MULTI_DATAMODEL
8623                 switch (ddi_model_convert_from(mode & FMODELS)) {
8624                 case DDI_MODEL_ILP32: {
8625                         size = sizeof (ibport_ioctl32_t);
8626                         if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8627                                 rval = EFAULT;
8628                                 goto fail;
8629                         }
8630                         portioc32.ibdioc.ioc_status = 0;
8631                         portioc32.ibdioc.ioc_portnum = port_state->id_port;
8632                         portioc32.ibdioc.ioc_hcaguid =
8633                             port_state->id_hca_guid;
8634                         portioc32.ibdioc.ioc_portguid =
8635                             port_state->id_port_guid;
8636                         portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8637                         if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8638                             mode)) {
8639                                 rval = EFAULT;
8640                                 goto fail;
8641                         }
8642                         break;
8643                 }
8644                 case DDI_MODEL_NONE:
8645                         size = sizeof (ibport_ioctl_t);
8646                         if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8647                                 rval = EFAULT;
8648                                 goto fail;
8649                         }
8650                         portioc.ibdioc.ioc_status = 0;
8651                         portioc.ibdioc.ioc_portnum = port_state->id_port;
8652                         portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8653                         portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8654                         portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8655                         if (ddi_copyout((void *)&portioc, (void *)arg, size,
8656                             mode)) {
8657                                 rval = EFAULT;
8658                                 goto fail;
8659                         }
8660                         break;
8661                 }
8662 #else /* ! _MULTI_DATAMODEL */
8663                 size = sizeof (ibport_ioctl_t);
8664                 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8665                         rval = EFAULT;
8666                         goto fail;
8667                 }
8668                 portioc.ibdioc.ioc_status = 0;
8669                 portioc.ibdioc.ioc_portnum = port_state->id_port;
8670                 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8671                 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8672                 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8673                 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8674                     mode)) {
8675                         rval = EFAULT;
8676                         goto fail;
8677                 }
8678 #endif /* _MULTI_DATAMODEL */
8679                 break;
8680 
8681         default:
8682                 return (EINVAL);
8683 
8684         } /* switch (cmd.ioc_info_cmd) */
8685 fail:
8686         if (pinfop) {
8687                 ibt_free_portinfo(pinfop, pinfosz);
8688         }
8689         return (rval);
8690 }
8691 
8692 /* ARGSUSED */
8693 static void
8694 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
8695     ibt_async_code_t code, ibt_async_event_t *event)
8696 {
8697         ibd_state_t *state = (ibd_state_t *)arg;
8698         link_state_t    lstate;
8699 
8700         switch (code) {
8701         case IBT_EVENT_PORT_UP:
8702         case IBT_ERROR_PORT_DOWN:
8703                 if (ibd_get_port_state(state, &lstate) != 0)
8704                         break;
8705 
8706                 if (state->id_link_state != lstate) {
8707                         state->id_link_state = lstate;
8708                         mac_link_update(state->id_mh, lstate);
8709                 }
8710                 break;
8711         default:
8712                 break;
8713         }
8714 }
8715 
8716 static int
8717 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
8718 {
8719         ibt_hca_portinfo_t *port_infop;
8720         uint_t psize, port_infosz;
8721         ibt_status_t    ret;
8722 
8723         ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
8724             &port_infop, &psize, &port_infosz);
8725         if ((ret != IBT_SUCCESS) || (psize != 1))
8726                 return (-1);
8727 
8728         state->id_sgid = *port_infop->p_sgid_tbl;
8729         state->id_link_speed = ibd_get_portspeed(state);
8730 
8731         if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
8732                 *lstate = LINK_STATE_UP;
8733         else
8734                 *lstate = LINK_STATE_DOWN;
8735 
8736         ibt_free_portinfo(port_infop, port_infosz);
8737         return (0);
8738 }
8739 
8740 static int
8741 ibd_port_attach(dev_info_t *dip)
8742 {
8743         ibd_state_t             *state;
8744         link_state_t            lstate;
8745         int                     instance;
8746         ibt_status_t            ret;
8747 
8748         /*
8749          * Allocate softstate structure
8750          */
8751         instance = ddi_get_instance(dip);
8752         if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
8753                 DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
8754                 return (DDI_FAILURE);
8755         }
8756 
8757         state = ddi_get_soft_state(ibd_list, instance);
8758 
8759         state->id_dip = dip;
8760         state->id_type = IBD_PORT_DRIVER;
8761 
8762         if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
8763             "port-number", 0)) == 0) {
8764                 DPRINT(10, "ibd_port_attach: invalid port number (%d)",
8765                     state->id_port);
8766                 return (DDI_FAILURE);
8767         }
8768         if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8769             "hca-guid", 0)) == 0) {
8770                 DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
8771                     state->id_hca_guid);
8772                 return (DDI_FAILURE);
8773         }
8774         if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8775             "port-guid", 0)) == 0) {
8776                 DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
8777                     state->id_port_guid);
8778                 return (DDI_FAILURE);
8779         }
8780 
8781         /*
8782          * Attach to IBTL
8783          */
8784         if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
8785             &state->id_ibt_hdl)) != IBT_SUCCESS) {
8786                 DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
8787                     ret);
8788                 goto done;
8789         }
8790 
8791         state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
8792 
8793         if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8794             &state->id_hca_hdl)) != IBT_SUCCESS) {
8795                 DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8796                     ret);
8797                 goto done;
8798         }
8799         state->id_mac_state |= IBD_DRV_HCA_OPENED;
8800 
8801         /* Update link status */
8802 
8803         if (ibd_get_port_state(state, &lstate) != 0) {
8804                 DPRINT(10,
8805                     "ibd_port_attach: ibd_get_port_state() failed");
8806                 goto done;
8807         }
8808         state->id_link_state = lstate;
8809         /*
8810          * Register ibd interfaces with the Nemo framework
8811          */
8812         if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
8813                 DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
8814                 goto done;
8815         }
8816         state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8817 
8818         mac_link_update(state->id_mh, lstate);
8819 
8820         return (DDI_SUCCESS);
8821 done:
8822         (void) ibd_port_unattach(state, dip);
8823         return (DDI_FAILURE);
8824 }
8825 
8826 static int
8827 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8828 {
8829         int instance;
8830         uint32_t progress = state->id_mac_state;
8831         ibt_status_t ret;
8832 
8833         if (progress & IBD_DRV_MAC_REGISTERED) {
8834                 (void) mac_unregister(state->id_mh);
8835                 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8836         }
8837 
8838         if (progress & IBD_DRV_HCA_OPENED) {
8839                 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8840                     IBT_SUCCESS) {
8841                         ibd_print_warn(state, "failed to close "
8842                             "HCA device, ret=%d", ret);
8843                 }
8844                 state->id_hca_hdl = NULL;
8845                 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8846         }
8847 
8848         if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8849                 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8850                         ibd_print_warn(state,
8851                             "ibt_detach() failed, ret=%d", ret);
8852                 }
8853                 state->id_ibt_hdl = NULL;
8854                 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8855         }
8856         instance = ddi_get_instance(dip);
8857         ddi_soft_state_free(ibd_list, instance);
8858 
8859         return (DDI_SUCCESS);
8860 }
8861 
8862 ibt_status_t
8863 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8864 {
8865         ibd_state_t     *state;
8866 
8867         mutex_enter(&ibd_objlist_lock);
8868 
8869         /* Find the ibd state structure corresponding to the partition */
8870         for (state = ibd_objlist_head; state; state = state->id_next) {
8871                 if (state->id_plinkid == linkid) {
8872                         break;
8873                 }
8874         }
8875 
8876         if (state == NULL) {
8877                 mutex_exit(&ibd_objlist_lock);
8878                 return (IBT_NO_SUCH_OBJECT);
8879         }
8880 
8881         attr->pa_dlinkid = state->id_dlinkid;
8882         attr->pa_plinkid = state->id_plinkid;
8883         attr->pa_port = state->id_port;
8884         attr->pa_hca_guid = state->id_hca_guid;
8885         attr->pa_port_guid = state->id_port_guid;
8886         attr->pa_pkey = state->id_pkey;
8887 
8888         mutex_exit(&ibd_objlist_lock);
8889 
8890         return (IBT_SUCCESS);
8891 }
8892 
8893 ibt_status_t
8894 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8895 {
8896         ibd_state_t     *state;
8897         int             n = 0;
8898         ibt_part_attr_t *attr;
8899 
8900         mutex_enter(&ibd_objlist_lock);
8901 
8902         for (state = ibd_objlist_head; state; state = state->id_next)
8903                 n++;
8904 
8905         *nparts = n;
8906         if (n == 0) {
8907                 *attr_list = NULL;
8908                 mutex_exit(&ibd_objlist_lock);
8909                 return (IBT_SUCCESS);
8910         }
8911 
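        /*
         * Allocate and fill the attribute list; the caller owns the
         * returned list and is responsible for freeing it.
         */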
8912         *attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8913         attr = *attr_list;
8914         for (state = ibd_objlist_head; state; state = state->id_next) {
8915 #ifdef DEBUG
8916                 ASSERT(n > 0);
8917                 n--;
8918 #endif
8919                 attr->pa_dlinkid = state->id_dlinkid;
8920                 attr->pa_plinkid = state->id_plinkid;
8921                 attr->pa_port = state->id_port;
8922                 attr->pa_hca_guid = state->id_hca_guid;
8923                 attr->pa_port_guid = state->id_port_guid;
8924                 attr->pa_pkey = state->id_pkey;
8925                 attr++;
8926         }
8927 
8928         mutex_exit(&ibd_objlist_lock);
8929         return (IBT_SUCCESS);
8930 }