1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * An implementation of the IPoIB standard based on PSARC 2001/289.
  28  */
  29 
  30 #include <sys/types.h>
  31 #include <sys/conf.h>
  32 #include <sys/ddi.h>
  33 #include <sys/sunddi.h>
  34 #include <sys/modctl.h>
  35 #include <sys/stropts.h>
  36 #include <sys/stream.h>
  37 #include <sys/strsun.h>
  38 #include <sys/strsubr.h>
  39 #include <sys/dlpi.h>
  40 #include <sys/mac_provider.h>
  41 
  42 #include <sys/pattr.h>            /* for HCK_FULLCKSUM */
  43 #include <sys/sysmacros.h>        /* for offsetof */
  44 #include <sys/disp.h>             /* for async thread pri */
  45 #include <sys/atomic.h>           /* for atomic_add*() */
  46 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */
  47 #include <netinet/in.h>           /* for netinet/ip.h below */
  48 #include <netinet/ip.h>           /* for struct ip */
  49 #include <netinet/udp.h>  /* for struct udphdr */
  50 #include <inet/common.h>  /* for inet/ip.h below */
  51 #include <inet/ip.h>              /* for ipha_t */
  52 #include <inet/ip6.h>             /* for ip6_t */
  53 #include <inet/tcp.h>             /* for tcph_t */
  54 #include <netinet/icmp6.h>        /* for icmp6_t */
  55 #include <sys/callb.h>
  56 #include <sys/modhash.h>
  57 
  58 #include <sys/ib/clients/ibd/ibd.h>
  59 #include <sys/ib/mgt/sm_attr.h>   /* for SM_INIT_TYPE_* */
  60 #include <sys/note.h>
  61 #include <sys/multidata.h>
  62 
  63 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */
  64 
  65 #include <sys/priv_names.h>
  66 #include <sys/dls.h>
  67 #include <sys/dld_ioc.h>
  68 #include <sys/policy.h>
  69 #include <sys/ibpart.h>
  70 #include <sys/file.h>
  71 
  72 /*
  73  * The write-up below includes details on the following:
  74  * 1. The dladm administrative model.
  75  * 2. Late HCA initialization feature.
  76  * 3. Brussels support and its implications to the current architecture.
  77  *
  78  * 1. The dladm administrative model.
  79  * ------------------------------------------
  80  * With the dladm model, ibnex will create one ibd instance per port. These
  81  * instances will be created independent of the port state.
  82  *
  83  * The ibd driver has two facets: one side works as the port driver and
  84  * the other as the partition object driver.
  85  *
  86  * The port instance is a child of the HCA, and will have an entry in the devfs.
  87  * A DDI attach only happens for the port driver, and its attach is
  88  * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
  89  * handled in ibd_port_unattach().
  90  *
  91  * The partition object is only a registrant to the mac layer via mac_register()
  92  * and does not have an entry in the device tree. There is no DDI softstate
  93  * managed by the DDI framework for the partition objects. However, the state is
  94  * managed inside the ibd driver, and every partition object hangs off the
  95  * "ibd_objlist_head".
  96  *
  97  * The partition object first comes into existence when a user runs the
  98  * 'create-part' subcommand of dladm. This is like invoking the attach entry
  99  * point of the partition object. The partition object goes away with the
 100  * 'delete-part' subcommand of dladm. This is like invoking the detach entry
 101  * point of the partition object.
 102  *
 103  * The create-part and delete-part subcommands result in dld ioctls that end up
 104  * calling ibd_create_partition() and ibd_delete_partition() respectively.
 105  * These ioctls are registered with the dld layer in _init() via a call to
 106  * dld_ioc_register().
 107  *
 108  * The port instance by itself cannot be plumbed. Only the partition
 109  * objects can be plumbed, and they alone participate in I/O; the port
 110  * driver does not.
 111  *
 112  * There are some info ioctls supported in ibd which are used by dladm(1M) to
 113  * display useful information. The info entry point for ibd is
 114  * ibd_get_partition_info().
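 *
 * As a rough illustration only (the names below are made up and dladm(1M)
 * remains the authority on the exact syntax), the administrative flow maps
 * onto the driver entry points roughly as follows:
 *
 *   dladm create-part -l <port-link> -P <pkey> <part-link>
 *                                           -> ibd_create_partition()
 *   dladm delete-part <part-link>           -> ibd_delete_partition()
 *   dladm show-part                         -> ibd_get_partition_info()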
 115  *
 116  * 2. Late HCA initialization feature.
 117  * ------------------------------------
 118  * As mentioned in section 1, the user creates the partition objects via
 119  * dladm(1M). It is possible that:
 120  * a) The physical port itself is down and the SM cannot be reached.
 121  * b) The PKEY specified by the user has not been created in the SM yet.
 122  * c) An IPoIB broadcast group for the specified PKEY is not present.
 123  *
 124  * In all of the above cases, complete initialization of the partition object
 125  * is not possible. However, the new model allows partition objects to be
 126  * created even in such cases, deferring their initialization until later.
 127  * When such a partition object is plumbed, the link state is displayed as
 128  * "down".
 129  * At this point the driver listens for events that herald the
 130  * availability of resources -
 131  * i)   LINK_UP, when the link becomes available
 132  * ii)  PORT_CHANGE, when the PKEY has been created
 133  * iii) MCG_CREATED, when the IPoIB broadcast group for the given pkey has
 134  *      been created
 135  * Events i) and ii) arrive via ibd_async_handler(), and event iii) via
 136  * ibd_snet_notices_handler().
 137  * The driver handles these events (as and when they arrive) and completes the
 138  * initialization of the partition object and transitions it to a usable state.
 139  *
 140  * 3. Brussels support and its implications to the current architecture.
 141  * ---------------------------------------------------------------------
 142  * The Brussels support introduces two new interfaces to the ibd driver -
 143  * ibd_m_getprop() and ibd_m_setprop().
 144  * These interfaces allow setting and retrieval of certain properties.
 145  * Some of them are public properties while most others are private properties
 146  * meant to be used by developers. Tuning the latter kind can cause
 147  * performance issues and should not be done without understanding the
 148  * implications. All properties are specific to an instance of either the
 149  * partition object or the port driver.
 150  *
 151  * The public properties are : mtu and linkmode.
 152  * mtu is a read-only property.
 153  * linkmode can take two values - UD and CM.
 154  *
 155  * Changing the linkmode requires some bookkeeping in the driver. The
 156  * capabilities need to be re-reported to the mac layer. This is done by
 157  * calling mac_capab_update().  The maxsdu is updated by calling
 158  * mac_maxsdu_update2().
 159  * The private properties retain their values across the change of linkmode.
 160  * NOTE:
 161  * - The port driver does not support any property apart from mtu.
 162  * - All other properties are only meant for the partition object.
 163  * - The properties cannot be set when an instance is plumbed. The
 164  * instance has to be unplumbed to effect any setting.
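 *
 * For example (illustrative only; the partition link name is made up and
 * the instance must be unplumbed first), the public properties can be
 * manipulated with the usual dladm property subcommands:
 *
 *   dladm set-linkprop -p linkmode=cm p1.ibp0
 *   dladm show-linkprop -p linkmode,mtu p1.ibp0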
 165  */
 166 
 167 /*
 168  * Driver wide tunables
 169  *
 170  * ibd_tx_softintr
 171  * ibd_rx_softintr
 172  *     The softintr mechanism allows ibd to avoid event queue overflows if
 173  *     the receive/completion handlers turn out to be expensive. Both are
 174  *     enabled by default.
 175  *
 176  * ibd_log_sz
 177  *     This specifies the size of the ibd log buffer in bytes. The buffer is
 178  *     allocated and logging is enabled only when IBD_LOGGING is defined.
 179  *
 180  */
 181 uint_t ibd_rx_softintr = 1;
 182 uint_t ibd_tx_softintr = 1;
 183 
 184 #ifdef IBD_LOGGING
 185 uint_t ibd_log_sz = 0x20000;
 186 #endif
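
/*
 * These tunables are plain global variables; as an illustration only
 * (assuming the module is loaded as "ibd", and using the system(4)
 * syntax), they could be overridden at boot time via /etc/system:
 *
 *      set ibd:ibd_rx_softintr = 0
 *      set ibd:ibd_tx_softintr = 0
 */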
 187 
 188 #ifdef IBD_LOGGING
 189 #define IBD_LOG_SZ                      ibd_log_sz
 190 #endif
 191 
 192 /* Post IBD_RX_POST_CNT receive work requests at a time. */
 193 #define IBD_RX_POST_CNT                 8
 194 
 195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
 196 #define IBD_LOG_RX_POST                 4
 197 
  198 /* Minimum number of receive work requests the driver needs to always have */
 199 #define IBD_RWQE_MIN    ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
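/* With the defaults above, this works out to (8 << 4) * 4 = 512 rwqes. */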
 200 
 201 /*
 202  * LSO parameters
 203  */
 204 #define IBD_LSO_MAXLEN                  65536
 205 #define IBD_LSO_BUFSZ                   8192
 206 
 207 /*
 208  * Async operation states
 209  */
 210 #define IBD_OP_NOTSTARTED               0
 211 #define IBD_OP_ONGOING                  1
 212 #define IBD_OP_COMPLETED                2
 213 #define IBD_OP_ERRORED                  3
 214 #define IBD_OP_ROUTERED                 4
 215 
 216 /*
 217  * Start/stop in-progress flags; note that restart must always remain
 218  * the OR of start and stop flag values.
 219  */
 220 #define IBD_DRV_START_IN_PROGRESS       0x10000000
 221 #define IBD_DRV_STOP_IN_PROGRESS        0x20000000
 222 #define IBD_DRV_RESTART_IN_PROGRESS     0x30000000
 223 #define IBD_DRV_DELETE_IN_PROGRESS      IBD_DRV_RESTART_IN_PROGRESS
 224 
 225 /*
 226  * Miscellaneous constants
 227  */
 228 #define IB_MGID_IPV4_LOWGRP_MASK        0xFFFFFFFF
 229 #define IBD_DEF_MAX_SDU                 2044
 230 #define IBD_DEF_MAX_MTU                 (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
 231 #define IBD_DEF_RC_MAX_SDU              65520
 232 #define IBD_DEF_RC_MAX_MTU              (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
 233 #define IBD_DEFAULT_QKEY                0xB1B
 234 #ifdef IBD_LOGGING
 235 #define IBD_DMAX_LINE                   100
 236 #endif
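
/*
 * In the MTU defaults above, IPOIB_HDRSIZE is the 4-byte IPoIB
 * encapsulation header, so IBD_DEF_MAX_MTU and IBD_DEF_RC_MAX_MTU work
 * out to 2044 + 4 = 2048 and 65520 + 4 = 65524 bytes respectively.
 */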
 237 
 238 /*
 239  * Enumerations for link states
 240  */
 241 typedef enum {
 242         IBD_LINK_DOWN,
 243         IBD_LINK_UP,
 244         IBD_LINK_UP_ABSENT
 245 } ibd_link_op_t;
 246 
 247 /*
 248  * Driver State Pointer
 249  */
 250 void *ibd_list;
 251 
 252 /*
 253  * Driver Global Data
 254  */
 255 ibd_global_state_t ibd_gstate;
 256 
 257 /*
 258  * Partition object list
 259  */
 260 ibd_state_t     *ibd_objlist_head = NULL;
 261 kmutex_t        ibd_objlist_lock;
 262 
 263 int ibd_rc_conn_timeout = 60 * 10;      /* 10 minutes */
 264 
 265 /*
 266  * Logging
 267  */
 268 #ifdef IBD_LOGGING
 269 kmutex_t ibd_lbuf_lock;
 270 uint8_t *ibd_lbuf;
 271 uint32_t ibd_lbuf_ndx;
 272 #endif
 273 
 274 /*
 275  * Required system entry points
 276  */
 277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
 278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
 279 
 280 /*
 281  * Required driver entry points for GLDv3
 282  */
 283 static int ibd_m_stat(void *, uint_t, uint64_t *);
 284 static int ibd_m_start(void *);
 285 static void ibd_m_stop(void *);
 286 static int ibd_m_promisc(void *, boolean_t);
 287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
 288 static int ibd_m_unicst(void *, const uint8_t *);
 289 static mblk_t *ibd_m_tx(void *, mblk_t *);
 290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
 291 
 292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
 293     const void *);
 294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
 295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
 296     mac_prop_info_handle_t);
 297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
 298     const void *);
 299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
 300 
 301 /*
 302  * Private driver entry points for GLDv3
 303  */
 304 
 305 /*
 306  * Initialization
 307  */
 308 static int ibd_state_init(ibd_state_t *, dev_info_t *);
 309 static int ibd_init_txlist(ibd_state_t *);
 310 static int ibd_init_rxlist(ibd_state_t *);
 311 static int ibd_acache_init(ibd_state_t *);
 312 #ifdef IBD_LOGGING
 313 static void ibd_log_init(void);
 314 #endif
 315 
 316 /*
 317  * Termination/cleanup
 318  */
 319 static void ibd_state_fini(ibd_state_t *);
 320 static void ibd_fini_txlist(ibd_state_t *);
 321 static void ibd_fini_rxlist(ibd_state_t *);
 322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
 323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
 324 static void ibd_acache_fini(ibd_state_t *);
 325 #ifdef IBD_LOGGING
 326 static void ibd_log_fini(void);
 327 #endif
 328 
 329 /*
 330  * Allocation/acquire/map routines
 331  */
 332 static int ibd_alloc_tx_copybufs(ibd_state_t *);
 333 static int ibd_alloc_rx_copybufs(ibd_state_t *);
 334 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
 335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
 336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
 337     uint32_t *);
 338 
 339 /*
 340  * Free/release/unmap routines
 341  */
 342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
 343 static void ibd_free_tx_copybufs(ibd_state_t *);
 344 static void ibd_free_rx_copybufs(ibd_state_t *);
 345 static void ibd_free_rx_rsrcs(ibd_state_t *);
 346 static void ibd_free_tx_lsobufs(ibd_state_t *);
 347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
 348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
 349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
 350 
 351 /*
 352  * Handlers/callback routines
 353  */
 354 static uint_t ibd_intr(caddr_t);
 355 static uint_t ibd_tx_recycle(caddr_t);
 356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
 357 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
 358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
 359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
 360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
 361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
 362 static void ibd_freemsg_cb(char *);
 363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
 364     ibt_async_event_t *);
 365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
 366     ibt_async_event_t *);
 367 static void ibd_snet_notices_handler(void *, ib_gid_t,
 368     ibt_subnet_event_code_t, ibt_subnet_event_t *);
 369 
 370 /*
 371  * Send/receive routines
 372  */
 373 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
 374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
 375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
 376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
 377 
 378 /*
 379  * Threads
 380  */
 381 static void ibd_async_work(ibd_state_t *);
 382 
 383 /*
 384  * Async tasks
 385  */
 386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
 387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
 388 static void ibd_async_setprom(ibd_state_t *);
 389 static void ibd_async_unsetprom(ibd_state_t *);
 390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
 391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
 392 static void ibd_async_txsched(ibd_state_t *);
 393 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
 394 
 395 /*
 396  * Async task helpers
 397  */
 398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
 399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
 400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
 401 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
 402     ipoib_mac_t *, ipoib_mac_t *);
 403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
 404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
 405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
 406 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
 407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
 408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
 409 static uint64_t ibd_get_portspeed(ibd_state_t *);
 410 static boolean_t ibd_async_safe(ibd_state_t *);
 411 static void ibd_async_done(ibd_state_t *);
 412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
 413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
 414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
 415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
 416 
 417 /*
 418  * Helpers for attach/start routines
 419  */
 420 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
 421 static int ibd_record_capab(ibd_state_t *);
 422 static int ibd_get_port_details(ibd_state_t *);
 423 static int ibd_alloc_cqs(ibd_state_t *);
 424 static int ibd_setup_ud_channel(ibd_state_t *);
 425 static int ibd_start(ibd_state_t *);
 426 static int ibd_undo_start(ibd_state_t *, link_state_t);
 427 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
 428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
 429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
 430 static void ibd_part_unattach(ibd_state_t *state);
 431 static int ibd_port_attach(dev_info_t *);
 432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
 433 static int ibd_get_port_state(ibd_state_t *, link_state_t *);
 434 static int ibd_part_busy(ibd_state_t *);
 435 
 436 /*
 437  * Miscellaneous helpers
 438  */
 439 static int ibd_sched_poll(ibd_state_t *, int, int);
 440 static void ibd_resume_transmission(ibd_state_t *);
 441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
 442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
 443 static void *list_get_head(list_t *);
 444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
 445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
 446 
 447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
 448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
 449 
 450 #ifdef IBD_LOGGING
 451 static void ibd_log(const char *, ...);
 452 #endif
 453 
 454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
 455     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
 456 
 457 /* Module Driver Info */
 458 static struct modldrv ibd_modldrv = {
 459         &mod_driverops,                     /* This one is a driver */
 460         "InfiniBand GLDv3 Driver",      /* short description */
 461         &ibd_dev_ops                        /* driver specific ops */
 462 };
 463 
 464 /* Module Linkage */
 465 static struct modlinkage ibd_modlinkage = {
 466         MODREV_1, (void *)&ibd_modldrv, NULL
 467 };
 468 
 469 /*
 470  * Module (static) info passed to IBTL during ibt_attach
 471  */
 472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
 473         IBTI_V_CURR,
 474         IBT_NETWORK,
 475         ibd_async_handler,
 476         NULL,
 477         "IBPART"
 478 };
 479 
 480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
 481         IBTI_V_CURR,
 482         IBT_NETWORK,
 483         ibdpd_async_handler,
 484         NULL,
 485         "IPIB"
 486 };
 487 
 488 /*
 489  * GLDv3 entry points
 490  */
 491 #define IBD_M_CALLBACK_FLAGS    \
 492         (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
 493 
 494 static mac_callbacks_t ibd_m_callbacks = {
 495         IBD_M_CALLBACK_FLAGS,
 496         ibd_m_stat,
 497         ibd_m_start,
 498         ibd_m_stop,
 499         ibd_m_promisc,
 500         ibd_m_multicst,
 501         ibd_m_unicst,
 502         ibd_m_tx,
 503         NULL,
 504         NULL,
 505         ibd_m_getcapab,
 506         NULL,
 507         NULL,
 508         ibd_m_setprop,
 509         ibd_m_getprop,
 510         ibd_m_propinfo
 511 };
 512 
 513 /* Private properties */
 514 char *ibd_priv_props[] = {
 515         "_ibd_broadcast_group",
 516         "_ibd_coalesce_completions",
 517         "_ibd_create_broadcast_group",
 518         "_ibd_hash_size",
 519         "_ibd_lso_enable",
 520         "_ibd_num_ah",
 521         "_ibd_num_lso_bufs",
 522         "_ibd_rc_enable_srq",
 523         "_ibd_rc_num_rwqe",
 524         "_ibd_rc_num_srq",
 525         "_ibd_rc_num_swqe",
 526         "_ibd_rc_rx_comp_count",
 527         "_ibd_rc_rx_comp_usec",
 528         "_ibd_rc_rx_copy_thresh",
 529         "_ibd_rc_rx_rwqe_thresh",
 530         "_ibd_rc_tx_comp_count",
 531         "_ibd_rc_tx_comp_usec",
 532         "_ibd_rc_tx_copy_thresh",
 533         "_ibd_ud_num_rwqe",
 534         "_ibd_ud_num_swqe",
 535         "_ibd_ud_rx_comp_count",
 536         "_ibd_ud_rx_comp_usec",
 537         "_ibd_ud_tx_comp_count",
 538         "_ibd_ud_tx_comp_usec",
 539         "_ibd_ud_tx_copy_thresh",
 540         NULL
 541 };
 542 
 543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
 544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
 545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
 546 
 547 static dld_ioc_info_t ibd_dld_ioctl_list[] = {
 548         {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
 549             ibd_create_partition, secpolicy_dl_config},
 550         {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
 551             ibd_delete_partition, secpolicy_dl_config},
 552         {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
 553             ibd_get_partition_info, NULL}
 554 };
 555 
 556 /*
 557  * Fill/clear <scope> and <p_key> in multicast/broadcast address
 558  */
 559 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)         \
 560 {                                                       \
 561         *(uint32_t *)((char *)(maddr) + 4) |=           \
 562             htonl((uint32_t)(scope) << 16);               \
 563         *(uint32_t *)((char *)(maddr) + 8) |=           \
 564             htonl((uint32_t)(pkey) << 16);                \
 565 }
 566 
 567 #define IBD_CLEAR_SCOPE_PKEY(maddr)                     \
 568 {                                                       \
 569         *(uint32_t *)((char *)(maddr) + 4) &=               \
 570             htonl(~((uint32_t)0xF << 16));                \
 571         *(uint32_t *)((char *)(maddr) + 8) &=               \
 572             htonl(~((uint32_t)0xFFFF << 16));             \
 573 }
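
/*
 * In the macros above, (maddr) points at an ipoib_mac_t: a 4-byte QPN
 * followed by the 16-byte MGID, so offset 4 is MGID byte 0 and offset 8
 * is MGID byte 4. In the IPoIB MGID layout (RFC 4391) the scope occupies
 * the low nibble of MGID byte 1 and the P_Key occupies MGID bytes 4 and 5,
 * which is why both values are shifted left by 16 bits before the htonl().
 */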
 574 
 575 /*
 576  * Rudimentary debugging support
 577  */
 578 #ifdef DEBUG
 579 int ibd_debuglevel = 100;
 580 void
 581 debug_print(int l, char *fmt, ...)
 582 {
 583         va_list ap;
 584 
 585         if (l < ibd_debuglevel)
 586                 return;
 587         va_start(ap, fmt);
 588         vcmn_err(CE_CONT, fmt, ap);
 589         va_end(ap);
 590 }
 591 #endif
 592 
 593 /*
 594  * Common routine to print warning messages; includes the hca guid, port
 595  * number and pkey so the IBA interface can be identified.
 596  */
 597 void
 598 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
 599 {
 600         ib_guid_t hca_guid;
 601         char ibd_print_buf[MAXNAMELEN + 256];
 602         int len;
 603         va_list ap;
 604         char part_name[MAXNAMELEN];
 605         datalink_id_t linkid = state->id_plinkid;
 606 
 607         hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
 608             0, "hca-guid", 0);
 609         (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL);
 610         len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
 611             "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ",
 612             ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
 613             (u_longlong_t)hca_guid, state->id_port, state->id_pkey,
 614             part_name);
 615         va_start(ap, fmt);
 616         (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
 617             fmt, ap);
 618         cmn_err(CE_NOTE, "!%s", ibd_print_buf);
 619         va_end(ap);
 620 }
 621 
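/*
 * Loadable module entry points. _init() sets up the driver soft state,
 * hooks the driver into the mac layer via mac_init_ops(), installs the
 * module, and registers the partition ioctls with dld; _fini() undoes
 * that work; _info() simply reports the module information.
 */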
 622 int
 623 _init()
 624 {
 625         int status;
 626 
 627         status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
 628             PAGESIZE), 0);
 629         if (status != 0) {
 630                 DPRINT(10, "_init:failed in ddi_soft_state_init()");
 631                 return (status);
 632         }
 633 
 634         mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
 635 
 636         mac_init_ops(&ibd_dev_ops, "ibp");
 637         status = mod_install(&ibd_modlinkage);
 638         if (status != 0) {
 639                 DPRINT(10, "_init:failed in mod_install()");
 640                 ddi_soft_state_fini(&ibd_list);
 641                 mac_fini_ops(&ibd_dev_ops);
 642                 return (status);
 643         }
 644 
 645         mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
 646         mutex_enter(&ibd_gstate.ig_mutex);
 647         ibd_gstate.ig_ibt_hdl = NULL;
 648         ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
 649         ibd_gstate.ig_service_list = NULL;
 650         mutex_exit(&ibd_gstate.ig_mutex);
 651 
 652         if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
 653             DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
 654                 return (EIO);
 655         }
 656 
 657         ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
 658 
 659 #ifdef IBD_LOGGING
 660         ibd_log_init();
 661 #endif
 662         return (0);
 663 }
 664 
 665 int
 666 _info(struct modinfo *modinfop)
 667 {
 668         return (mod_info(&ibd_modlinkage, modinfop));
 669 }
 670 
 671 int
 672 _fini()
 673 {
 674         int status;
 675 
 676         status = mod_remove(&ibd_modlinkage);
 677         if (status != 0)
 678                 return (status);
 679 
 680         ibt_unregister_part_attr_cb();
 681 
 682         mac_fini_ops(&ibd_dev_ops);
 683         mutex_destroy(&ibd_objlist_lock);
 684         ddi_soft_state_fini(&ibd_list);
 685         mutex_destroy(&ibd_gstate.ig_mutex);
 686 #ifdef IBD_LOGGING
 687         ibd_log_fini();
 688 #endif
 689         return (0);
 690 }
 691 
 692 /*
 693  * Convert the GID part of the mac address from network byte order
 694  * to host order.
 695  */
 696 static void
 697 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
 698 {
 699         ib_sn_prefix_t nbopref;
 700         ib_guid_t nboguid;
 701 
 702         bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
 703         bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
 704         dgid->gid_prefix = b2h64(nbopref);
 705         dgid->gid_guid = b2h64(nboguid);
 706 }
 707 
 708 /*
 709  * Create the IPoIB address in network byte order from host order inputs.
 710  */
 711 static void
 712 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
 713     ib_guid_t guid)
 714 {
 715         ib_sn_prefix_t nbopref;
 716         ib_guid_t nboguid;
 717 
 718         mac->ipoib_qpn = htonl(qpn);
 719         nbopref = h2b64(prefix);
 720         nboguid = h2b64(guid);
 721         bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
 722         bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
 723 }
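
/*
 * Both conversions above operate on the 20-byte IPoIB link-layer address
 * (ipoib_mac_t): a 4-byte QPN followed by the 16-byte GID, with the GID
 * split into an 8-byte subnet prefix and an 8-byte GUID.
 */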
 724 
 725 /*
 726  * Send to the appropriate all-routers group when the IBA multicast group
 727  * does not exist, based on whether the target group is v4 or v6.
 728  */
 729 static boolean_t
 730 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
 731     ipoib_mac_t *rmac)
 732 {
 733         boolean_t retval = B_TRUE;
 734         uint32_t adjscope = state->id_scope << 16;
 735         uint32_t topword;
 736 
 737         /*
 738          * Copy the first 4 bytes in without assuming any alignment of
 739          * input mac address; this will have IPoIB signature, flags and
 740          * scope bits.
 741          */
 742         bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
 743         topword = ntohl(topword);
 744 
 745         /*
 746          * Generate proper address for IPv4/v6, adding in the Pkey properly.
 747          */
 748         if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
 749             (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
 750                 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
 751                     ((uint32_t)(state->id_pkey << 16))),
 752                     (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
 753         else
 754                 /*
 755                  * Does not have proper bits in the mgid address.
 756                  */
 757                 retval = B_FALSE;
 758 
 759         return (retval);
 760 }
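
/*
 * In the ibd_h2n_mac() call above, the prefix argument carries the
 * recovered topword (signature, flags and scope) in its upper 32 bits and
 * the pkey in bits 16-31 of its lower half, while the guid argument works
 * out to INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP, i.e. group id 2 for
 * the all-routers group.
 */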
 761 
 762 /*
 763  * Membership states for different mcg's are tracked by two lists:
 764  * the "non" list is used for promiscuous mode, when all mcg traffic
 765  * needs to be inspected. This type of membership is never used for
 766  * transmission, so there can not be an AH in the active list
 767  * corresponding to a member in this list. This list does not need
 768  * any protection, since all operations are performed by the async
 769  * thread.
 770  *
 771  * "Full" and "SendOnly" membership is tracked using a single list,
 772  * the "full" list. This is because this single list can then be
 773  * searched during transmit to a multicast group (if an AH for the
 774  * mcg is not found in the active list), since at least one type
 775  * of membership must be present before initiating the transmit.
 776  * This list is also emptied during driver detach, since sendonly
 777  * membership acquired during transmit is dropped at detach time
 778  * along with ipv4 broadcast full membership. Insert/deletes to
 779  * this list are done only by the async thread, but it is also
 780  * searched in program context (see multicast disable case), thus
 781  * the id_mc_mutex protects the list. The driver detach path also
 782  * deconstructs the "full" list, but it ensures that the async
 783  * thread will not be accessing the list (by blocking out mcg
 784  * trap handling and making sure no more Tx reaping will happen).
 785  *
 786  * Currently, an IBA attach is done in the SendOnly case too,
 787  * although this is not required.
 788  */
 789 #define IBD_MCACHE_INSERT_FULL(state, mce) \
 790         list_insert_head(&state->id_mc_full, mce)
 791 #define IBD_MCACHE_INSERT_NON(state, mce) \
 792         list_insert_head(&state->id_mc_non, mce)
 793 #define IBD_MCACHE_FIND_FULL(state, mgid) \
 794         ibd_mcache_find(mgid, &state->id_mc_full)
 795 #define IBD_MCACHE_FIND_NON(state, mgid) \
 796         ibd_mcache_find(mgid, &state->id_mc_non)
 797 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \
 798         list_remove(&state->id_mc_full, mce)
 799 #define IBD_MCACHE_PULLOUT_NON(state, mce) \
 800         list_remove(&state->id_mc_non, mce)
 801 
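/*
 * Remove and return the entry at the head of the list, or NULL if the
 * list is empty.
 */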
 802 static void *
 803 list_get_head(list_t *list)
 804 {
 805         list_node_t *lhead = list_head(list);
 806 
 807         if (lhead != NULL)
 808                 list_remove(list, lhead);
 809         return (lhead);
 810 }
 811 
 812 /*
 813  * This is always guaranteed to be able to queue the work.
 814  */
 815 void
 816 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
 817 {
 818         /* Initialize request */
 819         DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
 820         ptr->rq_op = op;
 821 
 822         /*
 823          * Queue provided slot onto request pool.
 824          */
 825         mutex_enter(&state->id_acache_req_lock);
 826         list_insert_tail(&state->id_req_list, ptr);
 827 
 828         /* Go, fetch, async thread */
 829         cv_signal(&state->id_acache_req_cv);
 830         mutex_exit(&state->id_acache_req_lock);
 831 }
 832 
 833 /*
 834  * Main body of the per interface async thread.
 835  */
 836 static void
 837 ibd_async_work(ibd_state_t *state)
 838 {
 839         ibd_req_t *ptr;
 840         callb_cpr_t cprinfo;
 841 
 842         mutex_enter(&state->id_acache_req_lock);
 843         CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
 844             callb_generic_cpr, "ibd_async_work");
 845 
 846         for (;;) {
 847                 ptr = list_get_head(&state->id_req_list);
 848                 if (ptr != NULL) {
 849                         mutex_exit(&state->id_acache_req_lock);
 850 
 851                         /*
 852                          * If we are in late hca initialization mode, do not
 853                          * process any async request other than TRAP. TRAP
 854                          * is used to indicate creation of a broadcast group;
 855                          * in that case, we need to join/create the group.
 856                          */
 857                         if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
 858                             (ptr->rq_op != IBD_ASYNC_TRAP)) {
 859                                 goto free_req_and_continue;
 860                         }
 861 
 862                         /*
 863                          * Once we have done the operation, there is no
 864                          * guarantee the request slot is going to be valid,
 865                          * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
 866                          * TRAP).
 867                          *
 868                          * Perform the request.
 869                          */
 870                         switch (ptr->rq_op) {
 871                                 case IBD_ASYNC_GETAH:
 872                                         ibd_async_acache(state, &ptr->rq_mac);
 873                                         break;
 874                                 case IBD_ASYNC_JOIN:
 875                                 case IBD_ASYNC_LEAVE:
 876                                         ibd_async_multicast(state,
 877                                             ptr->rq_gid, ptr->rq_op);
 878                                         break;
 879                                 case IBD_ASYNC_PROMON:
 880                                         ibd_async_setprom(state);
 881                                         break;
 882                                 case IBD_ASYNC_PROMOFF:
 883                                         ibd_async_unsetprom(state);
 884                                         break;
 885                                 case IBD_ASYNC_REAP:
 886                                         ibd_async_reap_group(state,
 887                                             ptr->rq_ptr, ptr->rq_gid,
 888                                             IB_MC_JSTATE_FULL);
 889                                         /*
 890                                          * The req buf is embedded in the
 891                                          * mce structure, so we do not
 892                                          * need to free it here.
 893                                          */
 894                                         ptr = NULL;
 895                                         break;
 896                                 case IBD_ASYNC_TRAP:
 897                                         ibd_async_trap(state, ptr);
 898                                         break;
 899                                 case IBD_ASYNC_SCHED:
 900                                         ibd_async_txsched(state);
 901                                         break;
 902                                 case IBD_ASYNC_LINK:
 903                                         ibd_async_link(state, ptr);
 904                                         break;
 905                                 case IBD_ASYNC_EXIT:
 906                                         mutex_enter(&state->id_acache_req_lock);
 907                                         CALLB_CPR_EXIT(&cprinfo);
 908                                         return;
 909                                 case IBD_ASYNC_RC_TOO_BIG:
 910                                         ibd_async_rc_process_too_big(state,
 911                                             ptr);
 912                                         break;
 913                                 case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
 914                                         ibd_async_rc_close_act_chan(state, ptr);
 915                                         break;
 916                                 case IBD_ASYNC_RC_RECYCLE_ACE:
 917                                         ibd_async_rc_recycle_ace(state, ptr);
 918                                         break;
 919                                 case IBD_ASYNC_RC_CLOSE_PAS_CHAN:
 920                                         (void) ibd_rc_pas_close(ptr->rq_ptr,
 921                                             B_TRUE, B_TRUE);
 922                                         break;
 923                         }
 924 free_req_and_continue:
 925                         if (ptr != NULL)
 926                                 kmem_cache_free(state->id_req_kmc, ptr);
 927 
 928                         mutex_enter(&state->id_acache_req_lock);
 929                 } else {
 930                         /*
 931                          * Nothing to do: wait till a new request arrives.
 932                          */
 933                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
 934                         cv_wait(&state->id_acache_req_cv,
 935                             &state->id_acache_req_lock);
 936                         CALLB_CPR_SAFE_END(&cprinfo,
 937                             &state->id_acache_req_lock);
 938                 }
 939         }
 940 
 941         /*NOTREACHED*/
 942         _NOTE(NOT_REACHED)
 943 }
 944 
 945 /*
 946  * Return whether it is safe to queue requests to the async daemon; primarily
 947  * for subnet trap and async event handling. Disallow requests before the
 948  * daemon is created, and when interface deinitialization starts.
 949  */
 950 static boolean_t
 951 ibd_async_safe(ibd_state_t *state)
 952 {
 953         mutex_enter(&state->id_trap_lock);
 954         if (state->id_trap_stop) {
 955                 mutex_exit(&state->id_trap_lock);
 956                 return (B_FALSE);
 957         }
 958         state->id_trap_inprog++;
 959         mutex_exit(&state->id_trap_lock);
 960         return (B_TRUE);
 961 }
 962 
 963 /*
 964  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
 965  * trap or event handling to complete, so that it can kill the async thread
 966  * and deconstruct the mcg/ace list.
 967  */
 968 static void
 969 ibd_async_done(ibd_state_t *state)
 970 {
 971         mutex_enter(&state->id_trap_lock);
 972         if (--state->id_trap_inprog == 0)
 973                 cv_signal(&state->id_trap_cv);
 974         mutex_exit(&state->id_trap_lock);
 975 }
 976 
 977 /*
 978  * Hash functions:
 979  * ibd_hash_by_id: Returns the qpn as the hash value for the bucket.
 980  * ibd_hash_key_cmp: Compares two keys; returns 0 if they match, else 1.
 981  * These operate on mac addresses input into ibd_send, but there is no
 982  * guarantee on the alignment of the ipoib_mac_t structure.
 983  */
 984 /*ARGSUSED*/
 985 static uint_t
 986 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
 987 {
 988         ulong_t ptraddr = (ulong_t)key;
 989         uint_t hval;
 990 
 991         /*
 992          * If the input address is 4 byte aligned, we can just dereference
 993          * it. This is most common, since IP will send in a 4 byte aligned
 994  * IP header, which implies the 24 byte IPoIB pseudo header will be
 995          * 4 byte aligned too.
 996          */
 997         if ((ptraddr & 3) == 0)
 998                 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
 999 
1000         bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1001         return (hval);
1002 }
1003 
1004 static int
1005 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1006 {
1007         if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1008                 return (0);
1009         else
1010                 return (1);
1011 }
1012 
1013 /*
 1014  * Initialize all the per-interface caches and lists: AH cache,
 1015  * MCG list, etc.
1016  */
1017 static int
1018 ibd_acache_init(ibd_state_t *state)
1019 {
1020         ibd_ace_t *ce;
1021         int i;
1022 
1023         mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1024         mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1025         mutex_enter(&state->id_ac_mutex);
1026         list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1027             offsetof(ibd_ace_t, ac_list));
1028         list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1029             offsetof(ibd_ace_t, ac_list));
1030         state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1031             state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1032             ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1033         list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1034             offsetof(ibd_mce_t, mc_list));
1035         list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1036             offsetof(ibd_mce_t, mc_list));
1037         state->id_ac_hot_ace = NULL;
1038 
1039         state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1040             state->id_num_ah, KM_SLEEP);
1041         for (i = 0; i < state->id_num_ah; i++, ce++) {
1042                 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1043                     state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1044                         mutex_exit(&state->id_ac_mutex);
1045                         ibd_acache_fini(state);
1046                         return (DDI_FAILURE);
1047                 } else {
1048                         CLEAR_REFCYCLE(ce);
1049                         ce->ac_mce = NULL;
1050                         mutex_init(&ce->tx_too_big_mutex, NULL,
1051                             MUTEX_DRIVER, NULL);
1052                         IBD_ACACHE_INSERT_FREE(state, ce);
1053                 }
1054         }
1055         mutex_exit(&state->id_ac_mutex);
1056         return (DDI_SUCCESS);
1057 }
1058 
1059 static void
1060 ibd_acache_fini(ibd_state_t *state)
1061 {
1062         ibd_ace_t *ptr;
1063 
1064         mutex_enter(&state->id_ac_mutex);
1065 
1066         while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1067                 ASSERT(GET_REF(ptr) == 0);
1068                 mutex_destroy(&ptr->tx_too_big_mutex);
1069                 (void) ibt_free_ud_dest(ptr->ac_dest);
1070         }
1071 
1072         while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1073                 ASSERT(GET_REF(ptr) == 0);
1074                 mutex_destroy(&ptr->tx_too_big_mutex);
1075                 (void) ibt_free_ud_dest(ptr->ac_dest);
1076         }
1077 
1078         list_destroy(&state->id_ah_free);
1079         list_destroy(&state->id_ah_active);
1080         list_destroy(&state->id_mc_full);
1081         list_destroy(&state->id_mc_non);
1082         kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1083         mutex_exit(&state->id_ac_mutex);
1084         mutex_destroy(&state->id_ac_mutex);
1085         mutex_destroy(&state->id_mc_mutex);
1086 }
1087 
1088 /*
1089  * Search AH active hash list for a cached path to input destination.
1090  * If we are "just looking", hold == F. When we are in the Tx path,
1091  * we set hold == T to grab a reference on the AH so that it can not
1092  * be recycled to a new destination while the Tx request is posted.
1093  */
1094 ibd_ace_t *
1095 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1096 {
1097         ibd_ace_t *ptr;
1098 
1099         ASSERT(mutex_owned(&state->id_ac_mutex));
1100 
1101         /*
1102          * Do hash search.
1103          */
1104         if (mod_hash_find(state->id_ah_active_hash,
1105             (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1106                 if (hold)
1107                         INC_REF(ptr, num);
1108                 return (ptr);
1109         }
1110         return (NULL);
1111 }
1112 
1113 /*
1114  * This is called by the tx side; if an initialized AH is found in
1115  * the active list, it is locked down and can be used; if no entry
1116  * is found, an async request is queued to do path resolution.
1117  */
1118 static ibd_ace_t *
1119 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1120 {
1121         ibd_ace_t *ptr;
1122         ibd_req_t *req;
1123 
1124         /*
1125          * Only attempt to print when we can; in the mdt pattr case, the
1126          * address is not aligned properly.
1127          */
1128         if (((ulong_t)mac & 3) == 0) {
1129                 DPRINT(4,
1130                     "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1131                     htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1132                     htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1133                     htonl(mac->ipoib_gidsuff[1]));
1134         }
1135 
1136         mutex_enter(&state->id_ac_mutex);
1137 
1138         if (((ptr = state->id_ac_hot_ace) != NULL) &&
1139             (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1140                 INC_REF(ptr, numwqe);
1141                 mutex_exit(&state->id_ac_mutex);
1142                 return (ptr);
1143         }
1144         if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1145                 state->id_ac_hot_ace = ptr;
1146                 mutex_exit(&state->id_ac_mutex);
1147                 return (ptr);
1148         }
1149 
1150         /*
1151          * Implementation of a single outstanding async request; if
1152          * the operation is not started yet, queue a request and move
1153          * to ongoing state. Remember in id_ah_addr for which address
1154          * we are queueing the request, in case we need to flag an error;
1155         * any further requests, for the same or a different address, until
1156         * the operation completes, are sent back to GLDv3 to be retried.
1157          * The async thread will update id_ah_op with an error indication
1158          * or will set it to indicate the next look up can start; either
1159          * way, it will mac_tx_update() so that all blocked requests come
1160          * back here.
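         *
         * A rough sketch of the id_ah_op states involved here (the IBD_OP_*
         * values defined near the top of this file):
         *
         *   NOTSTARTED --(queue IBD_ASYNC_GETAH below)--> ONGOING
         *   ONGOING --(async thread)--> ERRORED / ROUTERED / ready again
         *   ERRORED / ROUTERED --(handled below)--> NOTSTARTED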
1161          */
1162         *err = EAGAIN;
1163         if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1164                 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1165                 if (req != NULL) {
1166                         /*
1167                          * We did not even find the entry; queue a request
1168                          * for it.
1169                          */
1170                         bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1171                         state->id_ah_op = IBD_OP_ONGOING;
1172                         ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1173                         bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1174                 }
1175         } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1176             (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1177                 /*
1178                  * Check the status of the pathrecord lookup request
1179                  * we had queued before.
1180                  */
1181                 if (state->id_ah_op == IBD_OP_ERRORED) {
1182                         *err = EFAULT;
1183                         state->id_ah_error++;
1184                 } else {
1185                         /*
1186                          * IBD_OP_ROUTERED case: We need to send to the
1187                          * all-router MCG. If we can find the AH for
1188                          * the mcg, the Tx will be attempted. If we
1189                          * do not find the AH, we return NORESOURCES
1190                          * to retry.
1191                          */
1192                         ipoib_mac_t routermac;
1193 
1194                         (void) ibd_get_allroutergroup(state, mac, &routermac);
1195                         ptr = ibd_acache_find(state, &routermac, B_TRUE,
1196                             numwqe);
1197                 }
1198                 state->id_ah_op = IBD_OP_NOTSTARTED;
1199         } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1200             (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1201                 /*
1202                  * This case can happen when we get a higher band
1203                  * packet. The easiest way is to reset the state machine
1204                  * to accommodate the higher priority packet.
1205                  */
1206                 state->id_ah_op = IBD_OP_NOTSTARTED;
1207         }
1208         mutex_exit(&state->id_ac_mutex);
1209 
1210         return (ptr);
1211 }
1212 
1213 /*
1214  * Grab a not-currently-in-use AH/PathRecord from the active
1215  * list to recycle to a new destination. Only the async thread
1216  * executes this code.
1217  */
1218 static ibd_ace_t *
1219 ibd_acache_get_unref(ibd_state_t *state)
1220 {
1221         ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1222         boolean_t try_rc_chan_recycle = B_FALSE;
1223 
1224         ASSERT(mutex_owned(&state->id_ac_mutex));
1225 
1226         /*
1227          * Do plain linear search.
1228          */
1229         while (ptr != NULL) {
1230                 /*
1231                  * Note that it is possible that the "cycle" bit
1232                  * is set on the AH w/o any reference count. The
1233                  * mcg must have been deleted, and the tx cleanup
1234                  * just decremented the reference count to 0, but
1235                  * hasn't gotten around to grabbing the id_ac_mutex
1236                  * to move the AH into the free list.
1237                  */
1238                 if (GET_REF(ptr) == 0) {
1239                         if (ptr->ac_chan != NULL) {
1240                                 ASSERT(state->id_enable_rc == B_TRUE);
1241                                 if (!try_rc_chan_recycle) {
1242                                         try_rc_chan_recycle = B_TRUE;
1243                                         ibd_rc_signal_ace_recycle(state, ptr);
1244                                 }
1245                         } else {
1246                                 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1247                                 break;
1248                         }
1249                 }
1250                 ptr = list_prev(&state->id_ah_active, ptr);
1251         }
1252         return (ptr);
1253 }
1254 
1255 /*
1256  * Invoked to clean up an AH from the active list in case of multicast
1257  * disable, to handle sendonly memberships during mcg traps, and for
1258  * port up processing of multicast and unicast AHs.
1259  * Normally, the AH is taken off the active list, and put into
1260  * the free list to be recycled for a new destination. In case
1261  * Tx requests on the AH have not completed yet, the AH is marked
1262  * for reaping (which will put the AH on the free list) once the Tx's
1263  * complete; in this case, depending on the "force" input, we take
1264  * out the AH from the active list right now, or leave it also for
1265  * the reap operation. Returns TRUE if the AH is taken off the active
1266  * list (and either put into the free list right now, or arranged for
1267  * later), FALSE otherwise.
1268  */
1269 boolean_t
1270 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1271 {
1272         ibd_ace_t *acactive;
1273         boolean_t ret = B_TRUE;
1274 
1275         ASSERT(mutex_owned(&state->id_ac_mutex));
1276 
1277         if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1278 
1279                 /*
1280                  * Note that the AH might already have the cycle bit set
1281                  * on it; this might happen if sequences of multicast
1282                  * enables and disables are coming so fast, that posted
1283                  * Tx's to the mcg have not completed yet, and the cycle
1284                  * bit is set successively by each multicast disable.
1285                  */
1286                 if (SET_CYCLE_IF_REF(acactive)) {
1287                         if (!force) {
1288                                 /*
1289                                  * The ace is kept on the active list, further
1290                                  * Tx's can still grab a reference on it; the
1291                                  * ace is reaped when all pending Tx's
1292                                  * referencing the AH complete.
1293                                  */
1294                                 ret = B_FALSE;
1295                         } else {
1296                                 /*
1297                                  * In the mcg trap case, we always pull the
1298                                  * AH from the active list. And also the port
1299                                  * up multi/unicast case.
1300                                  */
1301                                 ASSERT(acactive->ac_chan == NULL);
1302                                 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1303                                 acactive->ac_mce = NULL;
1304                         }
1305                 } else {
1306                         /*
1307                          * The ref count is 0, so reclaim the ace
1308                          * immediately after pulling it out of
1309                          * the active list.
1310                          */
1311                         ASSERT(acactive->ac_chan == NULL);
1312                         IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1313                         acactive->ac_mce = NULL;
1314                         IBD_ACACHE_INSERT_FREE(state, acactive);
1315                 }
1316 
1317         }
1318         return (ret);
1319 }
1320 
1321 /*
1322  * Helper function for async path record lookup. If we are trying to
1323  * Tx to a MCG, check our membership, possibly trying to join the
1324  * group if required. If that fails, try to send the packet to the
1325  * all router group (indicated by the redirect output), pointing
1326  * the input mac address to the router mcg address.
1327  */
1328 static ibd_mce_t *
1329 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1330 {
1331         ib_gid_t mgid;
1332         ibd_mce_t *mce;
1333         ipoib_mac_t routermac;
1334 
1335         *redirect = B_FALSE;
1336         ibd_n2h_gid(mac, &mgid);
1337 
1338         /*
1339          * Check the FullMember+SendOnlyNonMember list.
1340          * Since we are the only one who manipulates the
1341          * id_mc_full list, no locks are needed.
1342          */
1343         mce = IBD_MCACHE_FIND_FULL(state, mgid);
1344         if (mce != NULL) {
1345                 DPRINT(4, "ibd_async_mcache : already joined to group");
1346                 return (mce);
1347         }
1348 
1349         /*
1350          * Not found; try to join(SendOnlyNonMember) and attach.
1351          */
1352         DPRINT(4, "ibd_async_mcache : not joined to group");
1353         if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1354             NULL) {
1355                 DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1356                 return (mce);
1357         }
1358 
1359         /*
1360          * MCGroup not present; try to join the all-router group. If
1361          * any of the following steps succeed, we will be redirecting
1362          * to the all router group.
1363          */
1364         DPRINT(4, "ibd_async_mcache : nonmem join failed");
1365         if (!ibd_get_allroutergroup(state, mac, &routermac))
1366                 return (NULL);
1367         *redirect = B_TRUE;
1368         ibd_n2h_gid(&routermac, &mgid);
1369         bcopy(&routermac, mac, IPOIB_ADDRL);
1370         DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1371             mgid.gid_prefix, mgid.gid_guid);
1372 
1373         /*
1374          * Are we already joined to the router group?
1375          */
1376         if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1377                 DPRINT(4, "ibd_async_mcache : using already joined router "
1378                     "group\n");
1379                 return (mce);
1380         }
1381 
1382         /*
1383          * Can we join(SendOnlyNonMember) the router group?
1384          */
1385         DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1386         if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1387             NULL) {
1388                 DPRINT(4, "ibd_async_mcache : joined to router grp");
1389                 return (mce);
1390         }
1391 
1392         return (NULL);
1393 }
1394 
1395 /*
1396  * Async path record lookup code.
1397  */
1398 static void
1399 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1400 {
1401         ibd_ace_t *ce;
1402         ibd_mce_t *mce = NULL;
1403         ibt_path_attr_t path_attr;
1404         ibt_path_info_t path_info;
1405         ib_gid_t destgid;
1406         char ret = IBD_OP_NOTSTARTED;
1407 
1408         DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1409             htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1410             htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1411             htonl(mac->ipoib_gidsuff[1]));
1412 
1413         /*
1414          * Check whether we are trying to transmit to a MCG.
1415          * In that case, we need to make sure we are a member of
1416          * the MCG.
1417          */
1418         if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1419                 boolean_t redirected;
1420 
1421                 /*
1422                  * If we can not find or join the group or even
1423                  * redirect, error out.
1424                  */
1425                 if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1426                     NULL) {
1427                         state->id_ah_op = IBD_OP_ERRORED;
1428                         return;
1429                 }
1430 
1431                 /*
1432                  * If we got redirected, check whether the AH for the
1433                  * new mcg is already in the cache; if so, there is
1434                  * nothing more to pull in. Otherwise, proceed to get
1435                  * the path for the new mcg. There is no guarantee that
1436                  * an AH currently in the cache will still be there when
1437                  * we look in ibd_acache_lookup(), but that's okay; we
1438                  * will come back here in that case.
1439                  */
1440                 if (redirected) {
1441                         ret = IBD_OP_ROUTERED;
1442                         DPRINT(4, "ibd_async_acache :  redirected to "
1443                             "%08X:%08X:%08X:%08X:%08X",
1444                             htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1445                             htonl(mac->ipoib_gidpref[1]),
1446                             htonl(mac->ipoib_gidsuff[0]),
1447                             htonl(mac->ipoib_gidsuff[1]));
1448 
1449                         mutex_enter(&state->id_ac_mutex);
1450                         if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1451                                 state->id_ah_op = IBD_OP_ROUTERED;
1452                                 mutex_exit(&state->id_ac_mutex);
1453                                 DPRINT(4, "ibd_async_acache : router AH found");
1454                                 return;
1455                         }
1456                         mutex_exit(&state->id_ac_mutex);
1457                 }
1458         }
1459 
1460         /*
1461          * Get an AH from the free list.
1462          */
1463         mutex_enter(&state->id_ac_mutex);
1464         if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1465                 /*
1466                  * No free ones; try to grab an unreferenced active
1467                  * one. Maybe we need to make the active list LRU,
1468                  * but that will create more work for Tx callbacks.
1469                  * Is there a way of not having to pull out the
1470                  * entry from the active list, but just indicate it
1471                  * is being recycled? Yes, but that creates one more
1472                  * check in the fast lookup path.
1473                  */
1474                 if ((ce = ibd_acache_get_unref(state)) == NULL) {
1475                         /*
1476                          * Pretty serious shortage now.
1477                          */
1478                         state->id_ah_op = IBD_OP_NOTSTARTED;
1479                         mutex_exit(&state->id_ac_mutex);
1480                         DPRINT(10, "ibd_async_acache : failed to find AH "
1481                             "slot\n");
1482                         return;
1483                 }
1484                 /*
1485                  * We could check whether ac_mce points to a SendOnly
1486                  * member and drop that membership now. Or do it lazily
1487                  * at detach time.
1488                  */
1489                 ce->ac_mce = NULL;
1490         }
1491         mutex_exit(&state->id_ac_mutex);
1492         ASSERT(ce->ac_mce == NULL);
1493 
1494         /*
1495          * Update the entry.
1496          */
1497         bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1498 
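             /*
              * Resolve a path to the destination GID, using the mcg's SL and
              * our pkey, and program the resulting address vector into the UD
              * destination handle along with the mcg qkey and the remote QPN.
              */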
1499         bzero(&path_info, sizeof (path_info));
1500         bzero(&path_attr, sizeof (ibt_path_attr_t));
1501         path_attr.pa_sgid = state->id_sgid;
1502         path_attr.pa_num_dgids = 1;
1503         ibd_n2h_gid(&ce->ac_mac, &destgid);
1504         path_attr.pa_dgids = &destgid;
1505         path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1506         path_attr.pa_pkey = state->id_pkey;
1507         if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1508             &path_info, NULL) != IBT_SUCCESS) {
1509                 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1510                 goto error;
1511         }
1512         if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1513             ntohl(ce->ac_mac.ipoib_qpn),
1514             &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1515                 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1516                 goto error;
1517         }
1518 
1519         /*
1520          * mce is set whenever an AH is being associated with a
1521          * MCG; this will come in handy when we leave the MCG. The
1522          * lock protects Tx fastpath from scanning the active list.
1523          */
1524         if (mce != NULL)
1525                 ce->ac_mce = mce;
1526 
1527         /*
1528          * Initiate an RC mode connection for a unicast address.
1529          */
1530         if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1531             (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1532                 ASSERT(ce->ac_chan == NULL);
1533                 DPRINT(10, "ibd_async_acache: call "
1534                     "ibd_rc_try_connect(ace=%p)", ce);
1535                 ibd_rc_try_connect(state, ce, &path_info);
1536                 if (ce->ac_chan == NULL) {
1537                         DPRINT(10, "ibd_async_acache: failed to set up"
1538                             " RC channel");
1539                         state->rc_conn_fail++;
1540                         goto error;
1541                 }
1542         }
1543 
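             /*
              * The ace is now fully initialized; move it onto the active list
              * and record the op status for the Tx path waiting on this
              * lookup.
              */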
1544         mutex_enter(&state->id_ac_mutex);
1545         IBD_ACACHE_INSERT_ACTIVE(state, ce);
1546         state->id_ah_op = ret;
1547         mutex_exit(&state->id_ac_mutex);
1548         return;
1549 error:
1550         /*
1551          * We might want to drop SendOnly membership here if we
1552          * joined above. The lock protects Tx callbacks inserting
1553          * into the free list.
1554          */
1555         mutex_enter(&state->id_ac_mutex);
1556         state->id_ah_op = IBD_OP_ERRORED;
1557         IBD_ACACHE_INSERT_FREE(state, ce);
1558         mutex_exit(&state->id_ac_mutex);
1559 }
1560 
1561 /*
1562  * While restoring port's presence on the subnet on a port up, it is possible
1563  * that the port goes down again.
1564  */
1565 static void
1566 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1567 {
1568         ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1569         link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1570             LINK_STATE_UP;
1571         ibd_mce_t *mce, *pmce;
1572         ibd_ace_t *ace, *pace;
1573 
1574         DPRINT(10, "ibd_async_link(): %d", opcode);
1575 
1576         /*
1577          * On a link up, revalidate the link speed/width. No point doing
1578          * this on a link down, since we will be unable to do SA operations,
1579          * defaulting to the lowest speed. Also notice that we update our
1580          * notion of speed before calling mac_link_update(), which will do
1581          * necessary higher level notifications for speed changes.
1582          */
1583         if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1584                 state->id_link_speed = ibd_get_portspeed(state);
1585         }
1586 
1587         /*
1588          * Do all the work required to establish our presence on
1589          * the subnet.
1590          */
1591         if (opcode == IBD_LINK_UP_ABSENT) {
1592                 /*
1593                  * If in promiscuous mode ...
1594                  */
1595                 if (state->id_prom_op == IBD_OP_COMPLETED) {
1596                         /*
1597                          * Drop all nonmembership.
1598                          */
1599                         ibd_async_unsetprom(state);
1600 
1601                         /*
1602                          * Then, try to regain nonmembership to all mcg's.
1603                          */
1604                         ibd_async_setprom(state);
1605 
1606                 }
1607 
1608                 /*
1609                  * Drop all sendonly membership (which also gets rid of the
1610                  * AHs); try to reacquire all full membership.
1611                  */
1612                 mce = list_head(&state->id_mc_full);
1613                 while ((pmce = mce) != NULL) {
1614                         mce = list_next(&state->id_mc_full, mce);
1615                         if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1616                                 ibd_leave_group(state,
1617                                     pmce->mc_info.mc_adds_vect.av_dgid,
1618                                     IB_MC_JSTATE_SEND_ONLY_NON);
1619                         else
1620                                 ibd_reacquire_group(state, pmce);
1621                 }
1622 
1623                 /*
1624                  * Recycle all active AHs to free list (and if there are
1625                  * pending posts, make sure they will go into the free list
1626                  * once the Tx's complete). Grab the lock to prevent
1627                  * concurrent Tx's as well as Tx cleanups.
1628                  */
1629                 mutex_enter(&state->id_ac_mutex);
1630                 ace = list_head(&state->id_ah_active);
1631                 while ((pace = ace) != NULL) {
1632                         boolean_t cycled = B_TRUE;
1633 
1634                         ace = list_next(&state->id_ah_active, ace);
1635                         mce = pace->ac_mce;
1636                         if (pace->ac_chan != NULL) {
1637                                 ASSERT(mce == NULL);
1638                                 ASSERT(state->id_enable_rc == B_TRUE);
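                                     /*
                                      * An RC channel is associated with this
                                      * ace. If the connection is established,
                                      * take a reference, pull the ace off the
                                      * active list and hand it to the async RC
                                      * close path; otherwise another thread is
                                      * already closing the channel.
                                      */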
1639                                 if (pace->ac_chan->chan_state ==
1640                                     IBD_RC_STATE_ACT_ESTAB) {
1641                                         INC_REF(pace, 1);
1642                                         IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
1643                                         pace->ac_chan->chan_state =
1644                                             IBD_RC_STATE_ACT_CLOSING;
1645                                         ibd_rc_signal_act_close(state, pace);
1646                                 } else {
1647                                         state->rc_act_close_simultaneous++;
1648                                         DPRINT(40, "ibd_async_link: other "
1649                                             "thread is closing it, ace=%p, "
1650                                             "ac_chan=%p, chan_state=%d",
1651                                             pace, pace->ac_chan,
1652                                             pace->ac_chan->chan_state);
1653                                 }
1654                         } else {
1655                                 cycled = ibd_acache_recycle(state,
1656                                     &pace->ac_mac, B_TRUE);
1657                         }
1658                         /*
1659                          * If this is for an mcg, it must be for a fullmember,
1660                          * since we got rid of send-only members above when
1661                          * processing the mce list.
1662                          */
1663                         ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
1664                             IB_MC_JSTATE_FULL)));
1665 
1666                         /*
1667                          * Check if the fullmember mce needs to be torn down,
1668                          * ie whether the DLPI disable has already been done.
1669                          * If so, do some of the work of tx_cleanup, namely
1670                          * causing leave (which will fail), detach and
1671                          * mce-freeing. tx_cleanup will put the AH into free
1672                          * list. The reason to duplicate some of this
1673                          * tx_cleanup work is because we want to delete the
1674                          * AH right now instead of waiting for tx_cleanup, to
1675                          * force subsequent Tx's to reacquire an AH.
1676                          */
1677                         if ((mce != NULL) && (mce->mc_fullreap))
1678                                 ibd_async_reap_group(state, mce,
1679                                     mce->mc_info.mc_adds_vect.av_dgid,
1680                                     mce->mc_jstate);
1681                 }
1682                 mutex_exit(&state->id_ac_mutex);
1683         }
1684 
1685         /*
1686          * mac handle is guaranteed to exist since driver does ibt_close_hca()
1687          * (which stops further events from being delivered) before
1688          * mac_unregister(). At this point, it is guaranteed that mac_register
1689          * has already been done.
1690          */
1691         mutex_enter(&state->id_link_mutex);
1692         state->id_link_state = lstate;
1693         mac_link_update(state->id_mh, lstate);
1694         mutex_exit(&state->id_link_mutex);
1695 
1696         ibd_async_done(state);
1697 }
1698 
1699 /*
1700  * Check the pkey table to see if we can find the pkey we're looking for.
1701  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
1702  * failure.
1703  */
1704 static int
1705 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
1706     uint16_t *pkix)
1707 {
1708         uint16_t ndx;
1709 
1710         ASSERT(pkix != NULL);
1711 
1712         for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
1713                 if (pkey_tbl[ndx] == pkey) {
1714                         *pkix = ndx;
1715                         return (0);
1716                 }
1717         }
1718         return (-1);
1719 }
1720 
1721 /*
1722  * Late HCA Initialization:
1723  * If the plumb succeeded without an active port or the pkey being
1724  * available, and their availability is now being indicated via PORT_UP
1725  * or PORT_CHANGE respectively, try to start the interface.
1726  *
1727  * Normal Operation:
1728  * When the link is notified up, we need to do a few things, based
1729  * on the port's current p_init_type_reply claiming a reinit has been
1730  * done or not. The reinit steps are:
1731  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
1732  *    the old Pkey and GID0 are correct.
1733  * 2. Register for mcg traps (already done by ibmf).
1734  * 3. If PreservePresenceReply indicates the SM has restored port's presence
1735  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
1736  * 4. Give up all sendonly memberships.
1737  * 5. Acquire all full memberships.
1738  * 6. In promiscuous mode, acquire all non memberships.
1739  * 7. Recycle all AHs to free list.
1740  */
1741 static void
1742 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
1743 {
1744         ibt_hca_portinfo_t *port_infop = NULL;
1745         ibt_status_t ibt_status;
1746         uint_t psize, port_infosz;
1747         ibd_link_op_t opcode;
1748         ibd_req_t *req;
1749         link_state_t new_link_state = LINK_STATE_UP;
1750         uint8_t itreply;
1751         uint16_t pkix;
1752         int ret;
1753 
1754         /*
1755          * Let's not race with a plumb or an unplumb; if we detect a
1756          * pkey relocation event later on here, we may have to restart.
1757          */
1758         ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
1759 
1760         mutex_enter(&state->id_link_mutex);
1761 
1762         /*
1763          * If the link state is unknown, a plumb has not yet been attempted
1764          * on the interface. Nothing to do.
1765          */
1766         if (state->id_link_state == LINK_STATE_UNKNOWN) {
1767                 mutex_exit(&state->id_link_mutex);
1768                 goto link_mod_return;
1769         }
1770 
1771         /*
1772          * If the link state is down due to a plumb failure (we are neither in
1773          * late HCA init nor successfully started), there is nothing to do.
1774          */
1775         if ((state->id_link_state == LINK_STATE_DOWN) &&
1776             ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
1777             ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
1778                 mutex_exit(&state->id_link_mutex);
1779                 goto link_mod_return;
1780         }
1781 
1782         /*
1783          * If this routine was called in response to a port down event,
1784          * we just need to see if this should be informed.
1785          */
1786         if (code == IBT_ERROR_PORT_DOWN) {
1787                 new_link_state = LINK_STATE_DOWN;
1788                 goto update_link_state;
1789         }
1790 
1791         /*
1792          * If it's not a port down event we've received, try to get the port
1793          * attributes first. If we fail here, the port is as good as down.
1794          * Otherwise, if the link went down by the time the handler gets
1795          * here, give up - we cannot even validate the pkey/gid since those
1796          * are not valid and this is as bad as a port down anyway.
1797          */
1798         ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
1799             &port_infop, &psize, &port_infosz);
1800         if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
1801             (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
1802                 new_link_state = LINK_STATE_DOWN;
1803                 goto update_link_state;
1804         }
1805 
1806         /*
1807          * If in the previous attempt the pkey was not found, either because
1808          * the port state was down or because it was absent from the pkey table,
1809          * look for it now and try to start the interface.
1810          */
1811         if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
1812                 mutex_exit(&state->id_link_mutex);
1813                 if ((ret = ibd_start(state)) != 0) {
1814                         DPRINT(10, "ibd_linkmod: cannot start from late HCA "
1815                             "init, ret=%d", ret);
1816                 }
1817                 ibt_free_portinfo(port_infop, port_infosz);
1818                 goto link_mod_return;
1819         }
1820 
1821         /*
1822          * Check the SM InitTypeReply flags. If both NoLoadReply and
1823          * PreserveContentReply are 0, we don't know anything about the
1824          * data loaded into the port attributes, so we need to verify
1825          * if gid0 and pkey are still valid.
1826          */
1827         itreply = port_infop->p_init_type_reply;
1828         if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
1829             ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
1830                 /*
1831                  * Check to see if the subnet part of GID0 has changed. If
1832                  * not, check the simple case first to see if the pkey
1833                  * index is the same as before; finally check to see if the
1834                  * pkey has been relocated to a different index in the table.
1835                  */
1836                 if (bcmp(port_infop->p_sgid_tbl,
1837                     &state->id_sgid, sizeof (ib_gid_t)) != 0) {
1838 
1839                         new_link_state = LINK_STATE_DOWN;
1840 
1841                 } else if (port_infop->p_pkey_tbl[state->id_pkix] ==
1842                     state->id_pkey) {
1843 
1844                         new_link_state = LINK_STATE_UP;
1845 
1846                 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
1847                     port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
1848 
1849                         ibt_free_portinfo(port_infop, port_infosz);
1850                         mutex_exit(&state->id_link_mutex);
1851 
1852                         /*
1853                          * Currently a restart is required if our pkey has moved
1854                          * in the pkey table. If we get the ibt_recycle_ud() to
1855                          * work as documented (expected), we may be able to
1856                          * avoid a complete restart.  Note that we've already
1857                          * marked both the start and stop 'in-progress' flags,
1858                          * so it is ok to go ahead and do this restart.
1859                          */
1860                         (void) ibd_undo_start(state, LINK_STATE_DOWN);
1861                         if ((ret = ibd_start(state)) != 0) {
1862                                 DPRINT(10, "ibd_restart: cannot restart, "
1863                                     "ret=%d", ret);
1864                         }
1865 
1866                         goto link_mod_return;
1867                 } else {
1868                         new_link_state = LINK_STATE_DOWN;
1869                 }
1870         }
1871 
1872 update_link_state:
1873         if (port_infop) {
1874                 ibt_free_portinfo(port_infop, port_infosz);
1875         }
1876 
1877         /*
1878          * If we're reporting a link up, check InitTypeReply to see if
1879          * the SM has ensured that the port's presence in mcg, traps,
1880          * etc. is intact.
1881          */
1882         if (new_link_state == LINK_STATE_DOWN) {
1883                 opcode = IBD_LINK_DOWN;
1884         } else {
1885                 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
1886                     SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
1887                         opcode = IBD_LINK_UP;
1888                 } else {
1889                         opcode = IBD_LINK_UP_ABSENT;
1890                 }
1891         }
1892 
1893         /*
1894          * If the old state is the same as the new state, and the SM indicated
1895          * no change in the port parameters, nothing to do.
1896          */
1897         if ((state->id_link_state == new_link_state) && (opcode !=
1898             IBD_LINK_UP_ABSENT)) {
1899                 mutex_exit(&state->id_link_mutex);
1900                 goto link_mod_return;
1901         }
1902 
1903         /*
1904          * Ok, so there was a link state change; see if it's safe to ask
1905          * the async thread to do the work
1906          */
1907         if (!ibd_async_safe(state)) {
1908                 state->id_link_state = new_link_state;
1909                 mutex_exit(&state->id_link_mutex);
1910                 goto link_mod_return;
1911         }
1912 
1913         mutex_exit(&state->id_link_mutex);
1914 
1915         /*
1916          * Queue up a request for ibd_async_link() to handle this link
1917          * state change event
1918          */
1919         req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
1920         req->rq_ptr = (void *)opcode;
1921         ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
1922 
1923 link_mod_return:
1924         ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
1925 }
1926 
1927 /*
1928  * For the port up/down events, IBTL guarantees there will not be concurrent
1929  * invocations of the handler. IBTL might coalesce link transition events,
1930  * and not invoke the handler for _each_ up/down transition, but it will
1931  * invoke the handler with the last known state.
1932  */
1933 static void
1934 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1935     ibt_async_code_t code, ibt_async_event_t *event)
1936 {
1937         ibd_state_t *state = (ibd_state_t *)clnt_private;
1938 
1939         switch (code) {
1940         case IBT_ERROR_CATASTROPHIC_CHAN:
1941                 ibd_print_warn(state, "catastrophic channel error");
1942                 break;
1943         case IBT_ERROR_CQ:
1944                 ibd_print_warn(state, "completion queue error");
1945                 break;
1946         case IBT_PORT_CHANGE_EVENT:
1947                 /*
1948                  * Events will be delivered to all instances that have
1949                  * done ibt_open_hca() but not yet done ibt_close_hca().
1950                  * Only need to do work for our port; IBTF will deliver
1951                  * events for other ports on the hca we have ibt_open_hca'ed
1952                  * too. Note that id_port is initialized in ibd_attach()
1953                  * before we do an ibt_open_hca() in ibd_attach().
1954                  */
1955                 ASSERT(state->id_hca_hdl == hca_hdl);
1956                 if (state->id_port != event->ev_port)
1957                         break;
1958 
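                     /*
                      * Only pkey table changes are acted on here; our pkey may
                      * have moved to a different index or disappeared, which
                      * ibd_link_mod() will sort out.
                      */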
1959                 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
1960                     IBT_PORT_CHANGE_PKEY) {
1961                         ibd_link_mod(state, code);
1962                 }
1963                 break;
1964         case IBT_ERROR_PORT_DOWN:
1965         case IBT_CLNT_REREG_EVENT:
1966         case IBT_EVENT_PORT_UP:
1967                 /*
1968                  * Events will be delivered to all instances that have
1969                  * done ibt_open_hca() but not yet done ibt_close_hca().
1970                  * Only need to do work for our port; IBTF will deliver
1971                  * events for other ports on the hca we have ibt_open_hca'ed
1972                  * too. Note that id_port is initialized in ibd_attach()
1973                  * before we do an ibt_open_hca() in ibd_attach().
1974                  */
1975                 ASSERT(state->id_hca_hdl == hca_hdl);
1976                 if (state->id_port != event->ev_port)
1977                         break;
1978 
1979                 ibd_link_mod(state, code);
1980                 break;
1981 
1982         case IBT_HCA_ATTACH_EVENT:
1983         case IBT_HCA_DETACH_EVENT:
1984                 /*
1985                  * When a new card is plugged into the system, attach_event is
1986                  * invoked. Additionally, a cfgadm needs to be run to make the
1987                  * card known to the system, and an ifconfig needs to be run to
1988                  * plumb up any ibd interfaces on the card. In the case of card
1989                  * unplug, a cfgadm is run that will trigger any RCM scripts to
1990                  * unplumb the ibd interfaces on the card; when the card is
1991                  * actually unplugged, the detach_event is invoked;
1992                  * additionally, if any ibd instances are still active on the
1993                  * card (e.g. there were no associated RCM scripts), the
1994                  * driver's detach routine is invoked.
1995                  */
1996                 break;
1997         default:
1998                 break;
1999         }
2000 }
2001 
2002 static int
2003 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2004 {
2005         mac_register_t *macp;
2006         int ret;
2007 
2008         if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2009                 DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2010                 return (DDI_FAILURE);
2011         }
2012 
2013         /*
2014          * Note that when we register with mac during attach, we don't
2015          * have the id_macaddr yet, so we'll simply be registering a
2016          * zero macaddr that we'll overwrite later during plumb (in
2017          * ibd_m_start()). Similar is the case with id_mtu - we'll
2018          * update the mac layer with the correct mtu during plumb.
2019          */
2020         macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2021         macp->m_driver = state;
2022         macp->m_dip = dip;
2023         macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2024         macp->m_callbacks = &ibd_m_callbacks;
2025         macp->m_min_sdu = 0;
2026         macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
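             /*
              * The max SDU we advertise depends on the instance type and link
              * mode: port driver instances use the RC maximum, RC-enabled
              * partitions use the configured RC MTU less the IPoIB header, and
              * plain UD instances use the default UD maximum.
              */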
2027         if (state->id_type == IBD_PORT_DRIVER) {
2028                 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2029         } else if (state->id_enable_rc) {
2030                 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2031         } else {
2032                 macp->m_max_sdu = IBD_DEF_MAX_SDU;
2033         }
2034         macp->m_priv_props = ibd_priv_props;
2035 
2036         /*
2037          *  Register ourselves with the GLDv3 interface
2038          */
2039         if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2040                 mac_free(macp);
2041                 DPRINT(10,
2042                     "ibd_register_mac: mac_register() failed, ret=%d", ret);
2043                 return (DDI_FAILURE);
2044         }
2045 
2046         mac_free(macp);
2047         return (DDI_SUCCESS);
2048 }
2049 
2050 static int
2051 ibd_record_capab(ibd_state_t *state)
2052 {
2053         ibt_hca_attr_t hca_attrs;
2054         ibt_status_t ibt_status;
2055 
2056         /*
2057          * Query the HCA and fetch its attributes
2058          */
2059         ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2060         ASSERT(ibt_status == IBT_SUCCESS);
2061 
2062         /*
2063          * 1. Set the Hardware Checksum capability. Currently we only consider
2064          *    full checksum offload.
2065          */
2066         if (state->id_enable_rc) {
2067                 state->id_hwcksum_capab = 0;
2068         } else {
2069                 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2070                     == IBT_HCA_CKSUM_FULL) {
2071                         state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2072                 }
2073         }
2074 
2075         /*
2076          * 2. Set LSO policy, capability and maximum length
2077          */
2078         if (state->id_enable_rc) {
2079                 state->id_lso_capable = B_FALSE;
2080                 state->id_lso_maxlen = 0;
2081         } else {
2082                 if (hca_attrs.hca_max_lso_size > 0) {
2083                         state->id_lso_capable = B_TRUE;
2084                         if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2085                                 state->id_lso_maxlen = IBD_LSO_MAXLEN;
2086                         else
2087                                 state->id_lso_maxlen =
2088                                     hca_attrs.hca_max_lso_size;
2089                 } else {
2090                         state->id_lso_capable = B_FALSE;
2091                         state->id_lso_maxlen = 0;
2092                 }
2093         }
2094 
2095         /*
2096          * 3. Set Reserved L_Key capability
2097          */
2098         if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2099                 state->id_hca_res_lkey_capab = 1;
2100                 state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2101                 state->rc_enable_iov_map = B_TRUE;
2102         } else {
2103                 /* If no reserved lkey, we will not use ibt_map_mem_iov */
2104                 state->rc_enable_iov_map = B_FALSE;
2105         }
2106 
2107         /*
2108          * 4. Set maximum sqseg value after checking to see if extended sgl
2109          *    size information is provided by the hca
2110          */
2111         if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2112                 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2113                 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2114         } else {
2115                 state->id_max_sqseg = hca_attrs.hca_max_sgl;
2116                 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2117         }
2118         if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2119                 state->id_max_sqseg = IBD_MAX_SQSEG;
2120         } else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2121                 ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2122                     state->id_max_sqseg, IBD_MAX_SQSEG);
2123         }
2124         if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2125                 state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2126         } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2127                 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2128                     "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2129         }
2130 
2131         /*
2132          * Translating the virtual address regions into physical regions
2133          * for using the Reserved LKey feature results in a wr sgl that
2134          * is a little longer. Since failing ibt_map_mem_iov() is costly,
2135          * we'll fix a high-water mark (65%) for when we should stop.
2136          */
2137         state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2138         state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
2139 
2140         /*
2141          * 5. Set number of recv and send wqes after checking hca maximum
2142          *    channel size. Store the max channel size in the state so that it
2143          *    can be referred to when the swqe/rwqe change is requested via
2144          *    dladm.
2145          */
2146 
2147         state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2148 
2149         if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2150                 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2151 
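             /*
              * Cap the number of rx buffers that may be loaned up the stack so
              * that at least IBD_RWQE_MIN rwqes always remain available for
              * posting.
              */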
2152         state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2153             IBD_RWQE_MIN;
2154 
2155         if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2156                 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2157 
2158         return (DDI_SUCCESS);
2159 }
2160 
2161 static int
2162 ibd_part_busy(ibd_state_t *state)
2163 {
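             /*
              * Rx buffers that have been loaned up the stack must all be
              * returned before this partition object can be removed.
              */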
2164         if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2165                 DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
2166                 return (DDI_FAILURE);
2167         }
2168 
2169         if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2170                 DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
2171                 return (DDI_FAILURE);
2172         }
2173 
2174         /*
2175          * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
2176          * connecting to a remote IPoIB port. We can't remove this port.
2177          */
2178         if (state->id_ah_op == IBD_OP_ONGOING) {
2179                 DPRINT(10, "ibd_part_busy: failed: connecting\n");
2180                 return (DDI_FAILURE);
2181         }
2182 
2183         return (DDI_SUCCESS);
2184 }
2185 
2186 
2187 static void
2188 ibd_part_unattach(ibd_state_t *state)
2189 {
2190         uint32_t progress = state->id_mac_state;
2191         ibt_status_t ret;
2192 
2193         /* make sure rx resources are freed */
2194         ibd_free_rx_rsrcs(state);
2195 
2196         if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2197                 ASSERT(state->id_enable_rc);
2198                 ibd_rc_fini_srq_list(state);
2199                 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2200         }
2201 
2202         if (progress & IBD_DRV_MAC_REGISTERED) {
2203                 (void) mac_unregister(state->id_mh);
2204                 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2205         }
2206 
2207         if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2208                 /*
2209                  * No new async requests will be posted since the device
2210                  * link state has been marked as unknown; completion handlers
2211                  * have been turned off, so Tx handler will not cause any
2212                  * more IBD_ASYNC_REAP requests.
2213                  *
2214                  * Queue a request for the async thread to exit, which will
2215                  * be serviced after any pending ones. This can take a while,
2216                  * especially if the SM is unreachable, since IBMF will slowly
2217                  * time out each SM request issued by the async thread.  Reap
2218                  * the thread before continuing on; we do not want it to be
2219                  * lingering in modunloaded code.
2220                  */
2221                 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2222                 thread_join(state->id_async_thrid);
2223 
2224                 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2225         }
2226 
2227         if (progress & IBD_DRV_REQ_LIST_INITED) {
2228                 list_destroy(&state->id_req_list);
2229                 mutex_destroy(&state->id_acache_req_lock);
2230                 cv_destroy(&state->id_acache_req_cv);
2231                 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2232         }
2233 
2234         if (progress & IBD_DRV_PD_ALLOCD) {
2235                 if ((ret = ibt_free_pd(state->id_hca_hdl,
2236                     state->id_pd_hdl)) != IBT_SUCCESS) {
2237                         ibd_print_warn(state, "failed to free "
2238                             "protection domain, ret=%d", ret);
2239                 }
2240                 state->id_pd_hdl = NULL;
2241                 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2242         }
2243 
2244         if (progress & IBD_DRV_HCA_OPENED) {
2245                 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2246                     IBT_SUCCESS) {
2247                         ibd_print_warn(state, "failed to close "
2248                             "HCA device, ret=%d", ret);
2249                 }
2250                 state->id_hca_hdl = NULL;
2251                 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2252         }
2253 
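             /*
              * The global IBTL attachment is shared by all instances; tear it
              * down only when the last per-instance reference goes away.
              */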
2254         mutex_enter(&ibd_gstate.ig_mutex);
2255         if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2256                 if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2257                     IBT_SUCCESS) {
2258                         ibd_print_warn(state,
2259                             "ibt_detach() failed, ret=%d", ret);
2260                 }
2261                 state->id_ibt_hdl = NULL;
2262                 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2263                 ibd_gstate.ig_ibt_hdl_ref_cnt--;
2264         }
2265         if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2266             (ibd_gstate.ig_ibt_hdl != NULL)) {
2267                 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2268                     IBT_SUCCESS) {
2269                         ibd_print_warn(state, "ibt_detach(): global "
2270                             "failed, ret=%d", ret);
2271                 }
2272                 ibd_gstate.ig_ibt_hdl = NULL;
2273         }
2274         mutex_exit(&ibd_gstate.ig_mutex);
2275 
2276         if (progress & IBD_DRV_TXINTR_ADDED) {
2277                 ddi_remove_softintr(state->id_tx);
2278                 state->id_tx = NULL;
2279                 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2280         }
2281 
2282         if (progress & IBD_DRV_RXINTR_ADDED) {
2283                 ddi_remove_softintr(state->id_rx);
2284                 state->id_rx = NULL;
2285                 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2286         }
2287 
2288 #ifdef DEBUG
2289         if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2290                 kstat_delete(state->rc_ksp);
2291                 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2292         }
2293 #endif
2294 
2295         if (progress & IBD_DRV_STATE_INITIALIZED) {
2296                 ibd_state_fini(state);
2297                 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2298         }
2299 }
2300 
2301 int
2302 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2303 {
2304         ibt_status_t ret;
2305         int rv;
2306         kthread_t *kht;
2307 
2308         /*
2309          * Initialize mutexes and condition variables
2310          */
2311         if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2312                 DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2313                 return (DDI_FAILURE);
2314         }
2315         state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2316 
2317         /*
2318          * Allocate rx,tx softintr
2319          */
2320         if (ibd_rx_softintr == 1) {
2321                 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2322                     NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2323                         DPRINT(10, "ibd_part_attach: failed in "
2324                             "ddi_add_softintr(id_rx),  ret=%d", rv);
2325                         return (DDI_FAILURE);
2326                 }
2327                 state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2328         }
2329         if (ibd_tx_softintr == 1) {
2330                 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2331                     NULL, NULL, ibd_tx_recycle,
2332                     (caddr_t)state)) != DDI_SUCCESS) {
2333                         DPRINT(10, "ibd_part_attach: failed in "
2334                             "ddi_add_softintr(id_tx), ret=%d", rv);
2335                         return (DDI_FAILURE);
2336                 }
2337                 state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2338         }
2339 
2340         /*
2341          * Attach to IBTL
2342          */
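             /*
              * A single global IBTL handle (ibd_gstate.ig_ibt_hdl) is created
              * on first use and shared across instances; each instance also
              * attaches for its own per-instance handle and bumps the
              * reference count on the global one.
              */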
2343         mutex_enter(&ibd_gstate.ig_mutex);
2344         if (ibd_gstate.ig_ibt_hdl == NULL) {
2345                 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2346                     &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2347                         DPRINT(10, "ibd_part_attach: global: failed in "
2348                             "ibt_attach(), ret=%d", ret);
2349                         mutex_exit(&ibd_gstate.ig_mutex);
2350                         return (DDI_FAILURE);
2351                 }
2352         }
2353         if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2354             &state->id_ibt_hdl)) != IBT_SUCCESS) {
2355                 DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2356                     ret);
2357                 mutex_exit(&ibd_gstate.ig_mutex);
2358                 return (DDI_FAILURE);
2359         }
2360         ibd_gstate.ig_ibt_hdl_ref_cnt++;
2361         mutex_exit(&ibd_gstate.ig_mutex);
2362         state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2363 
2364         /*
2365          * Open the HCA
2366          */
2367         if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2368             &state->id_hca_hdl)) != IBT_SUCCESS) {
2369                 DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2370                     ret);
2371                 return (DDI_FAILURE);
2372         }
2373         state->id_mac_state |= IBD_DRV_HCA_OPENED;
2374 
2375 #ifdef DEBUG
2376         /* Initialize Driver Counters for Reliable Connected Mode */
2377         if (state->id_enable_rc) {
2378                 if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2379                         DPRINT(10, "ibd_part_attach: failed in "
2380                             "ibd_rc_init_stats");
2381                         return (DDI_FAILURE);
2382                 }
2383                 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2384         }
2385 #endif
2386 
2387         /*
2388          * Record capabilities
2389          */
2390         (void) ibd_record_capab(state);
2391 
2392         /*
2393          * Allocate a protection domain on the HCA
2394          */
2395         if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2396             &state->id_pd_hdl)) != IBT_SUCCESS) {
2397                 DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2398                     ret);
2399                 return (DDI_FAILURE);
2400         }
2401         state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2402 
2403 
2404         /*
2405          * Initialize the req_list that is required for the operation of
2406          * the async thread.
2407          */
2408         mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2409         cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2410         list_create(&state->id_req_list, sizeof (ibd_req_t),
2411             offsetof(ibd_req_t, rq_list));
2412         state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2413 
2414         /*
2415          * Create the async thread; thread_create never fails.
2416          */
2417         kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2418             TS_RUN, minclsyspri);
2419         state->id_async_thrid = kht->t_did;
2420         state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2421 
2422         return (DDI_SUCCESS);
2423 }
2424 
2425 /*
2426  * Attach device to the IO framework.
2427  */
2428 static int
2429 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2430 {
2431         int ret;
2432 
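             /*
              * Only DDI_ATTACH is handled; ibd does not support
              * suspend/resume, so any other command fails.
              */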
2433         switch (cmd) {
2434                 case DDI_ATTACH:
2435                         ret = ibd_port_attach(dip);
2436                         break;
2437                 default:
2438                         ret = DDI_FAILURE;
2439                         break;
2440         }
2441         return (ret);
2442 }
2443 
2444 /*
2445  * Detach device from the IO framework.
2446  */
2447 static int
2448 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2449 {
2450         ibd_state_t *state;
2451         int instance;
2452 
2453         /*
2454          * IBD doesn't support suspend/resume
2455          */
2456         if (cmd != DDI_DETACH)
2457                 return (DDI_FAILURE);
2458 
2459         /*
2460          * Get the instance softstate
2461          */
2462         instance = ddi_get_instance(dip);
2463         state = ddi_get_soft_state(ibd_list, instance);
2464 
2465         /*
2466          * Release all resources we're still holding.  Note that if we'd
2467          * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2468          * so far, we should find all the flags we need in id_mac_state.
2469          */
2470         return (ibd_port_unattach(state, dip));
2471 }
2472 
2473 /*
2474  * Pre ibt_attach() driver initialization
2475  */
2476 static int
2477 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2478 {
2479         char buf[64];
2480 
2481         mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2482         state->id_link_state = LINK_STATE_UNKNOWN;
2483 
2484         mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2485         cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2486         state->id_trap_stop = B_TRUE;
2487         state->id_trap_inprog = 0;
2488 
2489         mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2490         mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2491         state->id_dip = dip;
2492 
2493         mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2494 
2495         mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2496         mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2497         mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2498         state->id_tx_busy = 0;
2499         mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2500 
2501         state->id_rx_list.dl_bufs_outstanding = 0;
2502         state->id_rx_list.dl_cnt = 0;
2503         mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2504         mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
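             /*
              * Per-instance request cache; the name encodes the instance
              * number, pkey and plinkid so it is unique across partitions.
              */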
2505         (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip),
2506             state->id_pkey, state->id_plinkid);
2507         state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2508             0, NULL, NULL, NULL, NULL, NULL, 0);
2509 
2510         /* For Reliable Connected Mode */
2511         mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2512         mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2513         mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2514         mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2515         mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2516             MUTEX_DRIVER, NULL);
2517         mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);
2518 
2519         /*
2520          * Make RC the default link mode. If this fails during connection
2521          * setup, the link mode automatically transitions to UD.
2522          * Also set the RC MTU.
2523          */
2524         state->id_enable_rc = IBD_DEF_LINK_MODE;
2525         state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2526         state->id_mtu = IBD_DEF_MAX_MTU;
2527 
2528         /* Initialize all tunables to their defaults */
2529         state->id_lso_policy = IBD_DEF_LSO_POLICY;
2530         state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2531         state->id_num_ah = IBD_DEF_NUM_AH;
2532         state->id_hash_size = IBD_DEF_HASH_SIZE;
2533         state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2534         state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2535         state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2536         state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2537         state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2538         state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2539         state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2540         state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2541         state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2542         state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2543         state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2544         state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2545         state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2546         state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2547         state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2548         state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2549         state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2550         state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2551         state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2552         state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2553 
2554         return (DDI_SUCCESS);
2555 }
2556 
2557 /*
2558  * Post ibt_detach() driver deconstruction
2559  */
2560 static void
2561 ibd_state_fini(ibd_state_t *state)
2562 {
2563         kmem_cache_destroy(state->id_req_kmc);
2564 
2565         mutex_destroy(&state->id_rx_list.dl_mutex);
2566         mutex_destroy(&state->id_rx_free_list.dl_mutex);
2567 
2568         mutex_destroy(&state->id_txpost_lock);
2569         mutex_destroy(&state->id_tx_list.dl_mutex);
2570         mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2571         mutex_destroy(&state->id_lso_lock);
2572 
2573         mutex_destroy(&state->id_sched_lock);
2574         mutex_destroy(&state->id_scq_poll_lock);
2575         mutex_destroy(&state->id_rcq_poll_lock);
2576 
2577         cv_destroy(&state->id_trap_cv);
2578         mutex_destroy(&state->id_trap_lock);
2579         mutex_destroy(&state->id_link_mutex);
2580 
2581         /* For Reliable Connected Mode */
2582         mutex_destroy(&state->rc_timeout_lock);
2583         mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2584         mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2585         mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2586         mutex_destroy(&state->rc_tx_large_bufs_lock);
2587         mutex_destroy(&state->rc_rx_lock);
2588 }
2589 
2590 /*
2591  * Fetch link speed from SA for snmp ifspeed reporting.
2592  */
2593 static uint64_t
2594 ibd_get_portspeed(ibd_state_t *state)
2595 {
2596         int                     ret;
2597         ibt_path_info_t         path;
2598         ibt_path_attr_t         path_attr;
2599         uint8_t                 num_paths;
2600         uint64_t                ifspeed;
2601 
2602         /*
2603          * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2604          * translates to 2 Gbps data rate. Thus, 1X single data rate is
2605          * 2000000000. Start with that as default.
2606          */
2607         ifspeed = 2000000000;
2608 
2609         bzero(&path_attr, sizeof (path_attr));
2610 
2611         /*
2612          * Get the port speed from Loopback path information.
2613          */
2614         path_attr.pa_dgids = &state->id_sgid;
2615         path_attr.pa_num_dgids = 1;
2616         path_attr.pa_sgid = state->id_sgid;
2617 
2618         if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2619             &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2620                 goto earlydone;
2621 
2622         if (num_paths < 1)
2623                 goto earlydone;
2624 
2625         /*
2626          * In case SA does not return an expected value, report the default
2627          * speed as 1X.
2628          */
2629         ret = 1;
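             /*
              * Scale the 2 Gbps 1X SDR data rate by the link width/speed;
              * e.g. IBT_SRATE_20 (4X DDR, 20 Gbps signalling) carries
              * 8 * 2 Gbps = 16 Gbps of data after 8b/10b encoding.
              */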
2630         switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2631                 case IBT_SRATE_2:       /*  1X SDR i.e 2.5 Gbps */
2632                         ret = 1;
2633                         break;
2634                 case IBT_SRATE_10:      /*  4X SDR or 1X QDR i.e 10 Gbps */
2635                         ret = 4;
2636                         break;
2637                 case IBT_SRATE_30:      /* 12X SDR i.e 30 Gbps */
2638                         ret = 12;
2639                         break;
2640                 case IBT_SRATE_5:       /*  1X DDR i.e  5 Gbps */
2641                         ret = 2;
2642                         break;
2643                 case IBT_SRATE_20:      /*  4X DDR or 8X SDR i.e 20 Gbps */
2644                         ret = 8;
2645                         break;
2646                 case IBT_SRATE_40:      /*  8X DDR or 4X QDR i.e 40 Gbps */
2647                         ret = 16;
2648                         break;
2649                 case IBT_SRATE_60:      /* 12X DDR i.e 60 Gbps */
2650                         ret = 24;
2651                         break;
2652                 case IBT_SRATE_80:      /*  8X QDR i.e 80 Gbps */
2653                         ret = 32;
2654                         break;
2655                 case IBT_SRATE_120:     /* 12X QDR i.e 120 Gbps */
2656                         ret = 48;
2657                         break;
2658         }
2659 
2660         ifspeed *= ret;
2661 
2662 earlydone:
2663         return (ifspeed);
2664 }
2665 
2666 /*
2667  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2668  * representing the input mcg mgid.
2669  */
2670 static ibd_mce_t *
2671 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2672 {
2673         ibd_mce_t *ptr = list_head(mlist);
2674 
2675         /*
2676          * Do plain linear search.
2677          */
2678         while (ptr != NULL) {
2679                 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2680                     sizeof (ib_gid_t)) == 0)
2681                         return (ptr);
2682                 ptr = list_next(mlist, ptr);
2683         }
2684         return (NULL);
2685 }
2686 
2687 /*
2688  * Execute IBA JOIN.
2689  */
2690 static ibt_status_t
2691 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2692 {
2693         ibt_mcg_attr_t mcg_attr;
2694 
2695         bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2696         mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2697         mcg_attr.mc_mgid = mgid;
2698         mcg_attr.mc_join_state = mce->mc_jstate;
2699         mcg_attr.mc_scope = state->id_scope;
2700         mcg_attr.mc_pkey = state->id_pkey;
2701         mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2702         mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2703         mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2704         return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2705             NULL, NULL));
2706 }
2707 
2708 /*
2709  * This code JOINs the port in the proper way (depending on the join
2710  * state) so that IBA fabric will forward mcg packets to/from the port.
2711  * It also attaches the QPN to the mcg so it can receive those mcg
2712  * packets. This code makes sure not to attach the mcg to the QP if
2713  * that has been previously done due to the mcg being joined with a
2714  * different join state, even though this is not required by SWG_0216,
2715  * refid 3610.
2716  */
2717 static ibd_mce_t *
2718 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2719 {
2720         ibt_status_t ibt_status;
2721         ibd_mce_t *mce, *tmce, *omce = NULL;
2722         boolean_t do_attach = B_TRUE;
2723 
2724         DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2725             jstate, mgid.gid_prefix, mgid.gid_guid);
2726 
2727         /*
2728          * For enable_multicast Full member joins, we need to do some
2729          * extra work. If there is already an mce on the list that
2730          * indicates full membership, that means the membership has
2731          * not yet been dropped (since the disable_multicast was issued)
2732          * because there are pending Tx's to the mcg; in that case, just
2733          * mark the mce not to be reaped when the Tx completion queues
2734          * an async reap operation.
2735          *
2736          * If there is already an mce on the list indicating sendonly
2737          * membership, try to promote to full membership. Be careful
2738          * not to deallocate the old mce, since there might be an AH
2739          * pointing to it; instead, update the old mce with new data
2740          * that tracks the full membership.
2741          */
2742         if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2743             IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2744                 if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2745                         ASSERT(omce->mc_fullreap);
2746                         omce->mc_fullreap = B_FALSE;
2747                         return (omce);
2748                 } else {
2749                         ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2750                 }
2751         }
2752 
2753         /*
2754          * Allocate the ibd_mce_t to track this JOIN.
2755          */
2756         mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2757         mce->mc_fullreap = B_FALSE;
2758         mce->mc_jstate = jstate;
2759 
2760         if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2761                 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2762                     ibt_status);
2763                 kmem_free(mce, sizeof (ibd_mce_t));
2764                 return (NULL);
2765         }
2766 
2767         /*
2768          * Is an IBA attach required? Not if the interface is already joined
2769          * to the mcg in a different appropriate join state.
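               * A send-only join never needs the attach, since no mcg traffic
               * is to be received on the QP for that membership.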
2770          */
2771         if (jstate == IB_MC_JSTATE_NON) {
2772                 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2773                 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2774                         do_attach = B_FALSE;
2775         } else if (jstate == IB_MC_JSTATE_FULL) {
2776                 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2777                         do_attach = B_FALSE;
2778         } else {        /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2779                 do_attach = B_FALSE;
2780         }
2781 
2782         if (do_attach) {
2783                 /*
2784                  * Do the IBA attach.
2785                  */
2786                 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
2787                 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2788                     &mce->mc_info)) != IBT_SUCCESS) {
2789                         DPRINT(10, "ibd_join_group : failed qp attachment "
2790                             "%d\n", ibt_status);
2791                         /*
2792                          * NOTE that we should probably preserve the join info
2793                          * in the list and later try to leave again at detach
2794                          * time.
2795                          */
2796                         (void) ibt_leave_mcg(state->id_sgid, mgid,
2797                             state->id_sgid, jstate);
2798                         kmem_free(mce, sizeof (ibd_mce_t));
2799                         return (NULL);
2800                 }
2801         }
2802 
2803         /*
2804          * Insert the ibd_mce_t in the proper list.
2805          */
2806         if (jstate == IB_MC_JSTATE_NON) {
2807                 IBD_MCACHE_INSERT_NON(state, mce);
2808         } else {
2809                 /*
2810                  * Set up the mc_req fields used for reaping the
2811                  * mcg in case of delayed tx completion (see
2812                  * ibd_tx_cleanup()). Also done for sendonly join in
2813                  * case we are promoted to fullmembership later and
2814                  * keep using the same mce.
2815                  */
2816                 mce->mc_req.rq_gid = mgid;
2817                 mce->mc_req.rq_ptr = mce;
2818                 /*
2819                  * Check whether this is the case of trying to join as a
2820                  * full member when we were already joined send-only.
2821                  * We try to drop our SendOnly membership, but it is
2822                  * possible that the mcg does not exist anymore (and
2823                  * the subnet trap never reached us), so the leave
2824                  * operation might fail.
2825                  */
2826                 if (omce != NULL) {
2827                         (void) ibt_leave_mcg(state->id_sgid, mgid,
2828                             state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
2829                         omce->mc_jstate = IB_MC_JSTATE_FULL;
2830                         bcopy(&mce->mc_info, &omce->mc_info,
2831                             sizeof (ibt_mcg_info_t));
2832                         kmem_free(mce, sizeof (ibd_mce_t));
2833                         return (omce);
2834                 }
2835                 mutex_enter(&state->id_mc_mutex);
2836                 IBD_MCACHE_INSERT_FULL(state, mce);
2837                 mutex_exit(&state->id_mc_mutex);
2838         }
2839 
2840         return (mce);
2841 }
2842 
2843 /*
2844  * Called during port up event handling to attempt to reacquire full
2845  * membership to an mcg. Stripped down version of ibd_join_group().
2846  * Note that it is possible that the mcg might have gone away, and
2847  * gets recreated at this point.
2848  */
2849 static void
2850 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2851 {
2852         ib_gid_t mgid;
2853 
2854         /*
2855          * If the mc_fullreap flag is set, or this join fails, a subsequent
2856          * reap/leave is going to try to leave the group. We could prevent
2857          * that by adding a boolean flag into ibd_mce_t, if required.
2858          */
2859         if (mce->mc_fullreap)
2860                 return;
2861 
2862         mgid = mce->mc_info.mc_adds_vect.av_dgid;
2863 
2864         DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2865             mgid.gid_guid);
2866 
2867         /* While reacquiring, leave and then join the MCG */
2868         (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
2869             mce->mc_jstate);
2870         if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2871                 ibd_print_warn(state, "Failure on port up to rejoin "
2872                     "multicast gid %016llx:%016llx",
2873                     (u_longlong_t)mgid.gid_prefix,
2874                     (u_longlong_t)mgid.gid_guid);
2875 }
2876 
2877 /*
2878  * This code handles delayed Tx completion cleanups for mcg's to which
2879  * disable_multicast has been issued, regular mcg related cleanups during
2880  * disable_multicast, disable_promiscuous and mcg traps, as well as
2881  * cleanups during driver detach time. Depending on the join state,
2882  * it deletes the mce from the appropriate list and issues the IBA
2883  * leave/detach; except in the disable_multicast case when the mce
2884  * is left on the active list for a subsequent Tx completion cleanup.
2885  */
2886 static void
2887 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
2888     uint8_t jstate)
2889 {
2890         ibd_mce_t *tmce;
2891         boolean_t do_detach = B_TRUE;
2892 
2893         /*
2894          * Before detaching, we must check whether the other list
2895          * contains the mcg; if we detach blindly, the consumer
2896          * who set up the other list will also stop receiving
2897          * traffic.
2898          */
2899         if (jstate == IB_MC_JSTATE_FULL) {
2900                 /*
2901                  * The following check is only relevant while coming
2902                  * from the Tx completion path in the reap case.
2903                  */
2904                 if (!mce->mc_fullreap)
2905                         return;
2906                 mutex_enter(&state->id_mc_mutex);
2907                 IBD_MCACHE_PULLOUT_FULL(state, mce);
2908                 mutex_exit(&state->id_mc_mutex);
2909                 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2910                         do_detach = B_FALSE;
2911         } else if (jstate == IB_MC_JSTATE_NON) {
2912                 IBD_MCACHE_PULLOUT_NON(state, mce);
2913                 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2914                 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2915                         do_detach = B_FALSE;
2916         } else {        /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2917                 mutex_enter(&state->id_mc_mutex);
2918                 IBD_MCACHE_PULLOUT_FULL(state, mce);
2919                 mutex_exit(&state->id_mc_mutex);
2920                 do_detach = B_FALSE;
2921         }
2922 
2923         /*
2924          * If we are reacting to a mcg trap and leaving our sendonly or
2925          * non membership, the mcg is possibly already gone, so attempting
2926          * to leave might fail. On the other hand, we must try to leave
2927          * anyway, since this might be a trap from long ago, and we could
2928          * have potentially sendonly joined to a recent incarnation of
2929          * the mcg and are about to lose track of this information.
2930          */
2931         if (do_detach) {
2932                 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
2933                     "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
2934                 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
2935         }
2936 
2937         (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
2938         kmem_free(mce, sizeof (ibd_mce_t));
2939 }
2940 
2941 /*
2942  * Async code executed due to multicast and promiscuous disable requests
2943  * and mcg trap handling; also executed during driver detach. Mostly, a
2944  * leave and detach is done; except for the fullmember case when Tx
2945  * requests are pending, whence arrangements are made for subsequent
2946  * cleanup on Tx completion.
2947  */
2948 static void
2949 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2950 {
2951         ipoib_mac_t mcmac;
2952         boolean_t recycled;
2953         ibd_mce_t *mce;
2954 
2955         DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
2956             jstate, mgid.gid_prefix, mgid.gid_guid);
2957 
2958         if (jstate == IB_MC_JSTATE_NON) {
2959                 recycled = B_TRUE;
2960                 mce = IBD_MCACHE_FIND_NON(state, mgid);
2961                 /*
2962                  * In case we are handling a mcg trap, we might not find
2963                  * the mcg in the non list.
2964                  */
2965                 if (mce == NULL) {
2966                         return;
2967                 }
2968         } else {
2969                 mce = IBD_MCACHE_FIND_FULL(state, mgid);
2970 
2971                 /*
2972                  * In case we are handling a mcg trap, make sure the trap
2973                  * is not arriving late; if we have an mce that indicates
2974                  * that we are already a fullmember, that would be a clear
2975                  * indication that the trap arrived late (i.e., is for a
2976                  * previous incarnation of the mcg).
2977                  */
2978                 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
2979                         if ((mce == NULL) || (mce->mc_jstate ==
2980                             IB_MC_JSTATE_FULL)) {
2981                                 return;
2982                         }
2983                 } else {
2984                         ASSERT(jstate == IB_MC_JSTATE_FULL);
2985 
2986                         /*
2987                          * If the earlier join_group failed, mce will be
2988                          * NULL here, because the GLDv3 multicast-set entry
2989                          * point always returns success.
2990                          */
2991                         if (mce == NULL) {
2992                                 return;
2993                         }
2994 
2995                         mce->mc_fullreap = B_TRUE;
2996                 }
2997 
2998                 /*
2999                  * If no pending Tx's remain that reference the AH
3000                  * for the mcg, recycle it from active to free list.
3001                  * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3002                  * so the last completing Tx will cause an async reap
3003                  * operation to be invoked, at which time we will drop our
3004                  * membership to the mcg so that the pending Tx's complete
3005                  * successfully. Refer to comments on "AH and MCE active
3006                  * list manipulation" at top of this file. The lock protects
3007                  * against Tx fast path and Tx cleanup code.
3008                  */
3009                 mutex_enter(&state->id_ac_mutex);
3010                 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3011                 recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3012                     IB_MC_JSTATE_SEND_ONLY_NON));
3013                 mutex_exit(&state->id_ac_mutex);
3014         }
3015 
3016         if (recycled) {
3017                 DPRINT(2, "ibd_leave_group : leave_group reaping : "
3018                     "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3019                 ibd_async_reap_group(state, mce, mgid, jstate);
3020         }
3021 }
3022 
3023 /*
3024  * Find the broadcast address as defined by IPoIB; implicitly
3025  * determines the IBA scope, mtu, tclass etc of the link the
3026  * interface is going to be a member of.
3027  */
3028 static ibt_status_t
3029 ibd_find_bgroup(ibd_state_t *state)
3030 {
3031         ibt_mcg_attr_t mcg_attr;
3032         uint_t numg;
3033         uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3034             IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3035             IB_MC_SCOPE_GLOBAL };
3036         int i, mcgmtu;
3037         boolean_t found = B_FALSE;
3038         int ret;
3039         ibt_mcg_info_t mcg_info;
3040 
3041         state->id_bgroup_created = B_FALSE;
3042         state->id_bgroup_present = B_FALSE;
3043 
3044 query_bcast_grp:
3045         bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3046         mcg_attr.mc_pkey = state->id_pkey;
3047         state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3048 
3049         for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3050                 state->id_scope = mcg_attr.mc_scope = scopes[i];
3051 
3052                 /*
3053                  * Look for the IPoIB broadcast group.
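                       * The IPoIB IPv4 broadcast MGID has the form
                       * ff1S:401b:PPPP::ffff:ffff (S = scope nibble, PPPP =
                       * P_Key); with the default P_Key and link-local scope
                       * this is the well-known ff12:401b:ffff::ffff:ffff group.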
3054                  */
3055                 state->id_mgid.gid_prefix =
3056                     (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3057                     ((uint64_t)state->id_scope << 48) |
3058                     ((uint32_t)(state->id_pkey << 16)));
3059                 mcg_attr.mc_mgid = state->id_mgid;
3060                 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3061                     &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3062                         found = B_TRUE;
3063                         break;
3064                 }
3065         }
3066 
3067         if (!found) {
3068                 if (state->id_create_broadcast_group) {
3069                         /*
3070                          * If we created the broadcast group, but failed to
3071                          * find it, we can't do anything except leave the
3072                          * one we created and return failure.
3073                          */
3074                         if (state->id_bgroup_created) {
3075                                 ibd_print_warn(state, "IPoIB broadcast group "
3076                                     "absent. Unable to query after create.");
3077                                 goto find_bgroup_fail;
3078                         }
3079 
3080                         /*
3081                          * Create the ipoib broadcast group if it didn't exist
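                               * using the well-known IPoIB Q_Key and a
                               * subnet-local scope, then redo the query above
                               * to pick up the full mcg info.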
3082                          */
3083                         bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3084                         mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3085                         mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3086                         mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3087                         mcg_attr.mc_pkey = state->id_pkey;
3088                         mcg_attr.mc_flow = 0;
3089                         mcg_attr.mc_sl = 0;
3090                         mcg_attr.mc_tclass = 0;
3091                         state->id_mgid.gid_prefix =
3092                             (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3093                             ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3094                             ((uint32_t)(state->id_pkey << 16)));
3095                         mcg_attr.mc_mgid = state->id_mgid;
3096 
3097                         if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3098                             &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3099                                 ibd_print_warn(state, "IPoIB broadcast group "
3100                                     "absent, create failed: ret = %d\n", ret);
3101                                 state->id_bgroup_created = B_FALSE;
3102                                 return (IBT_FAILURE);
3103                         }
3104                         state->id_bgroup_created = B_TRUE;
3105                         goto query_bcast_grp;
3106                 } else {
3107                         ibd_print_warn(state, "IPoIB broadcast group absent");
3108                         return (IBT_FAILURE);
3109                 }
3110         }
3111 
3112         /*
3113          * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
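               * mc_mtu is an IB MTU code (1=256 ... 5=4096 bytes), hence the
               * "128 << code" conversion to bytes.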
3114          */
3115         mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3116         if (state->id_mtu < mcgmtu) {
3117                 ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3118                     "greater than port's maximum MTU %d", mcgmtu,
3119                     state->id_mtu);
3120                 ibt_free_mcg_info(state->id_mcinfo, 1);
3121                 goto find_bgroup_fail;
3122         }
3123         state->id_mtu = mcgmtu;
3124         state->id_bgroup_present = B_TRUE;
3125 
3126         return (IBT_SUCCESS);
3127 
3128 find_bgroup_fail:
3129         if (state->id_bgroup_created) {
3130                 (void) ibt_leave_mcg(state->id_sgid,
3131                     mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3132                     IB_MC_JSTATE_FULL);
3133         }
3134 
3135         return (IBT_FAILURE);
3136 }
3137 
3138 static int
3139 ibd_alloc_tx_copybufs(ibd_state_t *state)
3140 {
3141         ibt_mr_attr_t mem_attr;
3142 
3143         /*
3144          * Allocate one big chunk for all regular tx copy bufs
3145          */
3146         state->id_tx_buf_sz = state->id_mtu;
3147         if (state->id_lso_policy && state->id_lso_capable &&
3148             (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3149                 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3150         }
3151 
3152         state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe *
3153             state->id_tx_buf_sz, KM_SLEEP);
3154 
3155         state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe *
3156             sizeof (ibd_swqe_t), KM_SLEEP);
3157 
3158         /*
3159          * Do one memory registration on the entire txbuf area
3160          */
3161         mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3162         mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz;
3163         mem_attr.mr_as = NULL;
3164         mem_attr.mr_flags = IBT_MR_SLEEP;
3165         if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3166             &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3167                 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3168                 kmem_free(state->id_tx_wqes,
3169                     state->id_ud_num_swqe * sizeof (ibd_swqe_t));
3170                 kmem_free(state->id_tx_bufs,
3171                     state->id_ud_num_swqe * state->id_tx_buf_sz);
3172                 state->id_tx_bufs = NULL;
3173                 return (DDI_FAILURE);
3174         }
3175 
3176         return (DDI_SUCCESS);
3177 }
3178 
3179 static int
3180 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3181 {
3182         ibt_mr_attr_t mem_attr;
3183         ibd_lsobuf_t *buflist;
3184         ibd_lsobuf_t *lbufp;
3185         ibd_lsobuf_t *tail;
3186         ibd_lsobkt_t *bktp;
3187         uint8_t *membase;
3188         uint8_t *memp;
3189         uint_t memsz;
3190         int i;
3191 
3192         /*
3193          * Allocate the lso bucket
3194          */
3195         bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3196 
3197         /*
3198          * Allocate the entire lso memory and register it
3199          */
3200         memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ;
3201         membase = kmem_zalloc(memsz, KM_SLEEP);
3202 
3203         mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3204         mem_attr.mr_len = memsz;
3205         mem_attr.mr_as = NULL;
3206         mem_attr.mr_flags = IBT_MR_SLEEP;
3207         if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3208             &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3209                 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3210                 kmem_free(membase, memsz);
3211                 kmem_free(bktp, sizeof (ibd_lsobkt_t));
3212                 return (DDI_FAILURE);
3213         }
3214 
3215         mutex_enter(&state->id_lso_lock);
3216 
3217         /*
3218          * Now allocate the buflist.  Note that the elements in the buflist and
3219          * the buffers in the lso memory have a permanent 1-1 relation, so we
3220          * can always derive the address of a buflist entry from the address of
3221          * an lso buffer.
3222          */
3223         buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t),
3224             KM_SLEEP);
3225 
3226         /*
3227          * Set up the lso buf chain
3228          */
3229         memp = membase;
3230         lbufp = buflist;
3231         for (i = 0; i < state->id_num_lso_bufs; i++) {
3232                 lbufp->lb_isfree = 1;
3233                 lbufp->lb_buf = memp;
3234                 lbufp->lb_next = lbufp + 1;
3235 
3236                 tail = lbufp;
3237 
3238                 memp += IBD_LSO_BUFSZ;
3239                 lbufp++;
3240         }
3241         tail->lb_next = NULL;
3242 
3243         /*
3244          * Set up the LSO buffer information in ibd state
3245          */
3246         bktp->bkt_bufl = buflist;
3247         bktp->bkt_free_head = buflist;
3248         bktp->bkt_mem = membase;
3249         bktp->bkt_nelem = state->id_num_lso_bufs;
3250         bktp->bkt_nfree = bktp->bkt_nelem;
3251 
3252         state->id_lso = bktp;
3253         mutex_exit(&state->id_lso_lock);
3254 
3255         return (DDI_SUCCESS);
3256 }
3257 
3258 /*
3259  * Statically allocate Tx buffer list(s).
3260  */
3261 static int
3262 ibd_init_txlist(ibd_state_t *state)
3263 {
3264         ibd_swqe_t *swqe;
3265         ibt_lkey_t lkey;
3266         int i;
3267         uint_t len;
3268         uint8_t *bufaddr;
3269 
3270         if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3271                 return (DDI_FAILURE);
3272 
3273         if (state->id_lso_policy && state->id_lso_capable) {
3274                 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3275                         state->id_lso_capable = B_FALSE;
3276         }
3277 
3278         mutex_enter(&state->id_tx_list.dl_mutex);
3279         state->id_tx_list.dl_head = NULL;
3280         state->id_tx_list.dl_pending_sends = B_FALSE;
3281         state->id_tx_list.dl_cnt = 0;
3282         mutex_exit(&state->id_tx_list.dl_mutex);
3283         mutex_enter(&state->id_tx_rel_list.dl_mutex);
3284         state->id_tx_rel_list.dl_head = NULL;
3285         state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3286         state->id_tx_rel_list.dl_cnt = 0;
3287         mutex_exit(&state->id_tx_rel_list.dl_mutex);
3288 
3289         /*
3290          * Allocate and setup the swqe list
3291          */
3292         lkey = state->id_tx_mr_desc.md_lkey;
3293         bufaddr = state->id_tx_bufs;
3294         len = state->id_tx_buf_sz;
3295         swqe = state->id_tx_wqes;
3296         mutex_enter(&state->id_tx_list.dl_mutex);
3297         for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) {
3298                 swqe->swqe_next = NULL;
3299                 swqe->swqe_im_mblk = NULL;
3300 
3301                 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3302                     bufaddr;
3303                 swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3304                 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3305 
3306                 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3307                 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3308                 swqe->w_swr.wr_trans = IBT_UD_SRV;
3309 
3310                 /* These are set in send */
3311                 swqe->w_swr.wr_nds = 0;
3312                 swqe->w_swr.wr_sgl = NULL;
3313                 swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3314 
3315                 /* add to list */
3316                 state->id_tx_list.dl_cnt++;
3317                 swqe->swqe_next = state->id_tx_list.dl_head;
3318                 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3319         }
3320         mutex_exit(&state->id_tx_list.dl_mutex);
3321 
3322         return (DDI_SUCCESS);
3323 }
3324 
3325 static int
3326 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3327     uint32_t *nds_p)
3328 {
3329         ibd_lsobkt_t *bktp;
3330         ibd_lsobuf_t *lbufp;
3331         ibd_lsobuf_t *nextp;
3332         ibt_lkey_t lso_lkey;
3333         uint_t frag_sz;
3334         uint_t num_needed;
3335         int i;
3336 
3337         ASSERT(sgl_p != NULL);
3338         ASSERT(nds_p != NULL);
3339         ASSERT(req_sz != 0);
3340 
3341         /*
3342          * Determine how many bufs we'd need for the size requested
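               * (e.g. if IBD_LSO_BUFSZ were 8K, a 20000-byte request would
               * take three bufs, with the last sgl entry trimmed to 3616
               * bytes below)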
3343          */
3344         num_needed = req_sz / IBD_LSO_BUFSZ;
3345         if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3346                 num_needed++;
3347 
3348         mutex_enter(&state->id_lso_lock);
3349 
3350         /*
3351          * If we don't have enough lso bufs, return failure
3352          */
3353         ASSERT(state->id_lso != NULL);
3354         bktp = state->id_lso;
3355         if (bktp->bkt_nfree < num_needed) {
3356                 mutex_exit(&state->id_lso_lock);
3357                 return (-1);
3358         }
3359 
3360         /*
3361          * Pick the first 'num_needed' bufs from the free list
3362          */
3363         lso_lkey = bktp->bkt_mr_desc.md_lkey;
3364         lbufp = bktp->bkt_free_head;
3365         for (i = 0; i < num_needed; i++) {
3366                 ASSERT(lbufp->lb_isfree != 0);
3367                 ASSERT(lbufp->lb_buf != NULL);
3368 
3369                 nextp = lbufp->lb_next;
3370 
3371                 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3372                 sgl_p[i].ds_key = lso_lkey;
3373                 sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3374 
3375                 lbufp->lb_isfree = 0;
3376                 lbufp->lb_next = NULL;
3377 
3378                 lbufp = nextp;
3379         }
3380         bktp->bkt_free_head = lbufp;
3381 
3382         /*
3383          * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3384          * to adjust the last sgl entry's length. Since we know we need at
3385          * least one, the i-1 use below is ok.
3386          */
3387         if (frag_sz) {
3388                 sgl_p[i-1].ds_len = frag_sz;
3389         }
3390 
3391         /*
3392          * Update nfree count and return
3393          */
3394         bktp->bkt_nfree -= num_needed;
3395 
3396         mutex_exit(&state->id_lso_lock);
3397 
3398         *nds_p = num_needed;
3399 
3400         return (0);
3401 }
3402 
3403 static void
3404 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3405 {
3406         ibd_lsobkt_t *bktp;
3407         ibd_lsobuf_t *lbufp;
3408         uint8_t *lso_mem_end;
3409         uint_t ndx;
3410         int i;
3411 
3412         mutex_enter(&state->id_lso_lock);
3413 
3414         bktp = state->id_lso;
3415         ASSERT(bktp != NULL);
3416 
3417         lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3418         for (i = 0; i < nds; i++) {
3419                 uint8_t *va;
3420 
3421                 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3422                 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3423 
3424                 /*
3425                  * Figure out the buflist element this sgl buffer corresponds
3426                  * to and put it back at the head
3427                  */
3428                 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3429                 lbufp = bktp->bkt_bufl + ndx;
3430 
3431                 ASSERT(lbufp->lb_isfree == 0);
3432                 ASSERT(lbufp->lb_buf == va);
3433 
3434                 lbufp->lb_isfree = 1;
3435                 lbufp->lb_next = bktp->bkt_free_head;
3436                 bktp->bkt_free_head = lbufp;
3437         }
3438         bktp->bkt_nfree += nds;
3439 
3440         mutex_exit(&state->id_lso_lock);
3441 }
3442 
3443 static void
3444 ibd_free_tx_copybufs(ibd_state_t *state)
3445 {
3446         /*
3447          * Unregister txbuf mr
3448          */
3449         if (ibt_deregister_mr(state->id_hca_hdl,
3450             state->id_tx_mr_hdl) != IBT_SUCCESS) {
3451                 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3452         }
3453         state->id_tx_mr_hdl = NULL;
3454 
3455         /*
3456          * Free txbuf memory
3457          */
3458         kmem_free(state->id_tx_wqes, state->id_ud_num_swqe *
3459             sizeof (ibd_swqe_t));
3460         kmem_free(state->id_tx_bufs, state->id_ud_num_swqe *
3461             state->id_tx_buf_sz);
3462         state->id_tx_wqes = NULL;
3463         state->id_tx_bufs = NULL;
3464 }
3465 
3466 static void
3467 ibd_free_tx_lsobufs(ibd_state_t *state)
3468 {
3469         ibd_lsobkt_t *bktp;
3470 
3471         mutex_enter(&state->id_lso_lock);
3472 
3473         if ((bktp = state->id_lso) == NULL) {
3474                 mutex_exit(&state->id_lso_lock);
3475                 return;
3476         }
3477 
3478         /*
3479          * First, free the buflist
3480          */
3481         ASSERT(bktp->bkt_bufl != NULL);
3482         kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3483 
3484         /*
3485          * Unregister the LSO memory and free it
3486          */
3487         ASSERT(bktp->bkt_mr_hdl != NULL);
3488         if (ibt_deregister_mr(state->id_hca_hdl,
3489             bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3490                 DPRINT(10,
3491                     "ibd_free_tx_lsobufs: ibt_deregister_mr failed");
3492         }
3493         ASSERT(bktp->bkt_mem);
3494         kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3495 
3496         /*
3497          * Finally free the bucket
3498          */
3499         kmem_free(bktp, sizeof (ibd_lsobkt_t));
3500         state->id_lso = NULL;
3501 
3502         mutex_exit(&state->id_lso_lock);
3503 }
3504 
3505 /*
3506  * Free the statically allocated Tx buffer list.
3507  */
3508 static void
3509 ibd_fini_txlist(ibd_state_t *state)
3510 {
3511         /*
3512          * Free the allocated swqes
3513          */
3514         mutex_enter(&state->id_tx_list.dl_mutex);
3515         mutex_enter(&state->id_tx_rel_list.dl_mutex);
3516         state->id_tx_list.dl_head = NULL;
3517         state->id_tx_list.dl_pending_sends = B_FALSE;
3518         state->id_tx_list.dl_cnt = 0;
3519         state->id_tx_rel_list.dl_head = NULL;
3520         state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3521         state->id_tx_rel_list.dl_cnt = 0;
3522         mutex_exit(&state->id_tx_rel_list.dl_mutex);
3523         mutex_exit(&state->id_tx_list.dl_mutex);
3524 
3525         ibd_free_tx_lsobufs(state);
3526         ibd_free_tx_copybufs(state);
3527 }
3528 
3529 /*
3530  * Post a NULL-terminated list of rwqes.
3531  */
3532 static void
3533 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3534 {
3535         uint_t          i;
3536         uint_t          num_posted;
3537         ibt_status_t    ibt_status;
3538         ibt_recv_wr_t   wrs[IBD_RX_POST_CNT];
3539 
3540         while (rwqe) {
3541                 /* Post up to IBD_RX_POST_CNT receive work requests */
3542                 for (i = 0; i < IBD_RX_POST_CNT; i++) {
3543                         wrs[i] = rwqe->w_rwr;
3544                         rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3545                         if (rwqe == NULL) {
3546                                 i++;
3547                                 break;
3548                         }
3549                 }
3550 
3551                 /*
3552                  * If posting fails for some reason, we'll never receive
3553                  * completion intimation, so we'll need to cleanup. But
3554                  * we need to make sure we don't clean up nodes whose
3555                  * wrs have been successfully posted. We assume that the
3556                  * hca driver returns on the first failure to post and
3557                  * therefore the first 'num_posted' entries don't need
3558                  * cleanup here.
3559                  */
3560                 atomic_add_32(&state->id_rx_list.dl_cnt, i);
3561 
3562                 num_posted = 0;
3563                 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3564                     &num_posted);
3565                 if (ibt_status != IBT_SUCCESS) {
3566                         /* This cannot happen unless the device has an error. */
3567                         ibd_print_warn(state, "ibd_post_recv: FATAL: "
3568                             "posting multiple wrs failed: "
3569                             "requested=%d, done=%d, ret=%d",
3570                             IBD_RX_POST_CNT, num_posted, ibt_status);
3571                         atomic_add_32(&state->id_rx_list.dl_cnt,
3572                             num_posted - i);
3573                 }
3574         }
3575 }
3576 
3577 /*
3578  * Grab a list of rwqes from the array of lists, and post the list.
3579  */
3580 static void
3581 ibd_post_recv_intr(ibd_state_t *state)
3582 {
3583         ibd_rx_queue_t  *rxp;
3584         ibd_rwqe_t *list;
3585 
3586         /* rotate through the rx_queue array, expecting an adequate rwqe batch */
3587         state->id_rx_post_queue_index =
3588             (state->id_rx_post_queue_index + 1) &
3589             (state->id_rx_nqueues - 1);
3590 
3591         rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3592         mutex_enter(&rxp->rx_post_lock);
3593         list = WQE_TO_RWQE(rxp->rx_head);
3594         rxp->rx_head = NULL;
3595         rxp->rx_cnt = 0;
3596         mutex_exit(&rxp->rx_post_lock);
3597         ibd_post_recv_list(state, list);
3598 }
3599 
3600 /* macro explained below */
3601 #define RX_QUEUE_HASH(rwqe) \
3602         (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3603 
3604 /*
3605  * Add a rwqe to one of the Rx lists.  If the list has grown large
3606  * enough (close to IBD_RX_POST_CNT), post the list to the hardware.
3607  *
3608  * Note: one of 2^N lists is chosen via a hash.  This is done
3609  * because using one list is contentious.  If the first list is busy
3610  * (mutex_tryenter fails), use a second list (just call mutex_enter).
3611  *
3612  * The shift of 8 in RX_QUEUE_HASH is an arbitrary choice that provides
3613  * an even distribution of rwqes across the 2^N queues.
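      * For instance, with id_rx_nqueues == 8, an rwqe at a (hypothetical)
      * address 0x30001200 maps to queue (0x300012 & 7) == 2.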
3614  */
3615 static void
3616 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3617 {
3618         ibd_rx_queue_t  *rxp;
3619 
3620         rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
3621 
3622         if (!mutex_tryenter(&rxp->rx_post_lock)) {
3623                 /* Failed.  Try a different queue ("ptr + 16" ensures that). */
3624                 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
3625                 mutex_enter(&rxp->rx_post_lock);
3626         }
3627         rwqe->rwqe_next = rxp->rx_head;
3628         if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
3629                 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
3630 
3631                 /* only call ibt_post_recv() every Nth time through here */
3632                 if ((active & (state->id_rx_nqueues - 1)) == 0) {
3633                         rxp->rx_head = NULL;
3634                         rxp->rx_cnt = 0;
3635                         mutex_exit(&rxp->rx_post_lock);
3636                         ibd_post_recv_list(state, rwqe);
3637                         return;
3638                 }
3639         }
3640         rxp->rx_head = RWQE_TO_WQE(rwqe);
3641         mutex_exit(&rxp->rx_post_lock);
3642 }
3643 
3644 static int
3645 ibd_alloc_rx_copybufs(ibd_state_t *state)
3646 {
3647         ibt_mr_attr_t mem_attr;
3648         int i;
3649 
3650         /*
3651          * Allocate one big chunk for all regular rx copy bufs
3652          */
3653         state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
3654 
3655         state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe *
3656             state->id_rx_buf_sz, KM_SLEEP);
3657 
3658         state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe *
3659             sizeof (ibd_rwqe_t), KM_SLEEP);
3660 
3661         state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
3662         state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
3663             sizeof (ibd_rx_queue_t), KM_SLEEP);
3664         for (i = 0; i < state->id_rx_nqueues; i++) {
3665                 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3666                 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
3667         }
3668 
3669         /*
3670          * Do one memory registration on the entire rxbuf area
3671          */
3672         mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
3673         mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz;
3674         mem_attr.mr_as = NULL;
3675         mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3676         if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3677             &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
3678                 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
3679                 kmem_free(state->id_rx_wqes,
3680                     state->id_ud_num_rwqe * sizeof (ibd_rwqe_t));
3681                 kmem_free(state->id_rx_bufs,
3682                     state->id_ud_num_rwqe * state->id_rx_buf_sz);
3683                 state->id_rx_bufs = NULL;
3684                 state->id_rx_wqes = NULL;
3685                 return (DDI_FAILURE);
3686         }
3687 
3688         return (DDI_SUCCESS);
3689 }
3690 
3691 /*
3692  * Initialize the statically allocated Rx buffer list.
3693  */
3694 static int
3695 ibd_init_rxlist(ibd_state_t *state)
3696 {
3697         ibd_rwqe_t *rwqe, *next;
3698         ibd_wqe_t *list;
3699         ibt_lkey_t lkey;
3700         int i;
3701         uint_t len;
3702         uint8_t *bufaddr;
3703 
3704         mutex_enter(&state->id_rx_free_list.dl_mutex);
3705         if (state->id_rx_free_list.dl_head != NULL) {
3706                 /* rx rsrcs were never freed.  Just repost them */
3707                 len = state->id_rx_buf_sz;
3708                 list = state->id_rx_free_list.dl_head;
3709                 state->id_rx_free_list.dl_head = NULL;
3710                 state->id_rx_free_list.dl_cnt = 0;
3711                 mutex_exit(&state->id_rx_free_list.dl_mutex);
3712                 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3713                     rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
3714                         if ((rwqe->rwqe_im_mblk = desballoc(
3715                             rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
3716                             &rwqe->w_freemsg_cb)) == NULL) {
3717                                 /* allow freemsg_cb to free the rwqes */
3718                                 if (atomic_dec_32_nv(&state->id_running) != 0) {
3719                                         cmn_err(CE_WARN, "ibd_init_rxlist: "
3720                                             "id_running was not 1\n");
3721                                 }
3722                                 DPRINT(10, "ibd_init_rxlist : "
3723                                     "failed in desballoc()");
3724                                 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3725                                     rwqe = next) {
3726                                         next = WQE_TO_RWQE(rwqe->rwqe_next);
3727                                         if (rwqe->rwqe_im_mblk) {
3728                                                 atomic_inc_32(&state->
3729                                                     id_rx_list.
3730                                                     dl_bufs_outstanding);
3731                                                 freemsg(rwqe->rwqe_im_mblk);
3732                                         } else
3733                                                 ibd_free_rwqe(state, rwqe);
3734                                 }
3735                                 atomic_inc_32(&state->id_running);
3736                                 return (DDI_FAILURE);
3737                         }
3738                 }
3739                 ibd_post_recv_list(state, WQE_TO_RWQE(list));
3740                 return (DDI_SUCCESS);
3741         }
3742         mutex_exit(&state->id_rx_free_list.dl_mutex);
3743 
3744         if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
3745                 return (DDI_FAILURE);
3746 
3747         /*
3748          * Allocate and setup the rwqe list
3749          */
3750         len = state->id_rx_buf_sz;
3751         lkey = state->id_rx_mr_desc.md_lkey;
3752         rwqe = state->id_rx_wqes;
3753         bufaddr = state->id_rx_bufs;
3754         list = NULL;
3755         for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) {
3756                 rwqe->w_state = state;
3757                 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3758                 rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3759 
3760                 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
3761 
3762                 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
3763                     &rwqe->w_freemsg_cb)) == NULL) {
3764                         DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
3765                         /* allow freemsg_cb to free the rwqes */
3766                         if (atomic_dec_32_nv(&state->id_running) != 0) {
3767                                 cmn_err(CE_WARN, "ibd_init_rxlist: "
3768                                     "id_running was not 1\n");
3769                         }
3772                         for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3773                             rwqe = next) {
3774                                 next = WQE_TO_RWQE(rwqe->rwqe_next);
3775                                 freemsg(rwqe->rwqe_im_mblk);
3776                         }
3777                         atomic_inc_32(&state->id_running);
3778 
3779                         /* remove reference to free'd rwqes */
3780                         mutex_enter(&state->id_rx_free_list.dl_mutex);
3781                         state->id_rx_free_list.dl_head = NULL;
3782                         state->id_rx_free_list.dl_cnt = 0;
3783                         mutex_exit(&state->id_rx_free_list.dl_mutex);
3784 
3785                         ibd_fini_rxlist(state);
3786                         return (DDI_FAILURE);
3787                 }
3788 
3789                 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
3790                 rwqe->rwqe_copybuf.ic_sgl.ds_va =
3791                     (ib_vaddr_t)(uintptr_t)bufaddr;
3792                 rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
3793                 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3794                 rwqe->w_rwr.wr_nds = 1;
3795                 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3796 
3797                 rwqe->rwqe_next = list;
3798                 list = RWQE_TO_WQE(rwqe);
3799         }
3800         ibd_post_recv_list(state, WQE_TO_RWQE(list));
3801 
3802         return (DDI_SUCCESS);
3803 }
3804 
3805 static void
3806 ibd_free_rx_copybufs(ibd_state_t *state)
3807 {
3808         int i;
3809 
3810         /*
3811          * Unregister rxbuf mr
3812          */
3813         if (ibt_deregister_mr(state->id_hca_hdl,
3814             state->id_rx_mr_hdl) != IBT_SUCCESS) {
3815                 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
3816         }
3817         state->id_rx_mr_hdl = NULL;
3818 
3819         /*
3820          * Free rxbuf memory
3821          */
3822         for (i = 0; i < state->id_rx_nqueues; i++) {
3823                 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3824                 mutex_destroy(&rxp->rx_post_lock);
3825         }
3826         kmem_free(state->id_rx_queues, state->id_rx_nqueues *
3827             sizeof (ibd_rx_queue_t));
3828         kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe *
3829             sizeof (ibd_rwqe_t));
3830         kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe *
3831             state->id_rx_buf_sz);
3832         state->id_rx_queues = NULL;
3833         state->id_rx_wqes = NULL;
3834         state->id_rx_bufs = NULL;
3835 }
3836 
3837 static void
3838 ibd_free_rx_rsrcs(ibd_state_t *state)
3839 {
3840         mutex_enter(&state->id_rx_free_list.dl_mutex);
3841         if (state->id_rx_free_list.dl_head == NULL) {
3842                 /* already freed */
3843                 mutex_exit(&state->id_rx_free_list.dl_mutex);
3844                 return;
3845         }
3846         ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe);
3847         ibd_free_rx_copybufs(state);
3848         state->id_rx_free_list.dl_cnt = 0;
3849         state->id_rx_free_list.dl_head = NULL;
3850         mutex_exit(&state->id_rx_free_list.dl_mutex);
3851 }
3852 
3853 /*
3854  * Free the statically allocated Rx buffer list.
3855  */
3856 static void
3857 ibd_fini_rxlist(ibd_state_t *state)
3858 {
3859         ibd_rwqe_t *rwqe;
3860         int i;
3861 
3862         /* run through the rx_queue's, calling freemsg() */
3863         for (i = 0; i < state->id_rx_nqueues; i++) {
3864                 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3865                 mutex_enter(&rxp->rx_post_lock);
3866                 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
3867                     rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
3868                         freemsg(rwqe->rwqe_im_mblk);
3869                         rxp->rx_cnt--;
3870                 }
3871                 rxp->rx_head = NULL;
3872                 mutex_exit(&rxp->rx_post_lock);
3873         }
3874 
3875         /* cannot free rx resources unless gld returned everything */
3876         if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
3877                 ibd_free_rx_rsrcs(state);
3878 }
3879 
3880 /*
3881  * Free an allocated recv wqe.
3882  */
3883 /* ARGSUSED */
3884 static void
3885 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3886 {
3887         /*
3888          * desballoc() failed (no memory).
3889          *
3890          * This rwqe is placed on a free list so that it
3891          * can be reinstated when memory is available.
3892          *
3893          * NOTE: no code currently exists to reinstate
3894          * these "lost" rwqes.
3895          */
3896         mutex_enter(&state->id_rx_free_list.dl_mutex);
3897         state->id_rx_free_list.dl_cnt++;
3898         rwqe->rwqe_next = state->id_rx_free_list.dl_head;
3899         state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
3900         mutex_exit(&state->id_rx_free_list.dl_mutex);
3901 }
3902 
3903 /*
3904  * IBA Rx completion queue handler. Guaranteed to be single
3905  * threaded and nonreentrant for this CQ.
3906  */
3907 /* ARGSUSED */
3908 static void
3909 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3910 {
3911         ibd_state_t *state = (ibd_state_t *)arg;
3912 
3913         atomic_inc_64(&state->id_num_intrs);
3914 
3915         if (ibd_rx_softintr == 1) {
3916                 mutex_enter(&state->id_rcq_poll_lock);
3917                 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
3918                         state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
3919                         mutex_exit(&state->id_rcq_poll_lock);
3920                         return;
3921                 } else {
3922                         mutex_exit(&state->id_rcq_poll_lock);
3923                         ddi_trigger_softintr(state->id_rx);
3924                 }
3925         } else
3926                 (void) ibd_intr((caddr_t)state);
3927 }
3928 
3929 /*
3930  * CQ handler for Tx completions, when the Tx CQ is in
3931  * interrupt driven mode.
3932  */
3933 /* ARGSUSED */
3934 static void
3935 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3936 {
3937         ibd_state_t *state = (ibd_state_t *)arg;
3938 
3939         atomic_inc_64(&state->id_num_intrs);
3940 
3941         if (ibd_tx_softintr == 1) {
3942                 mutex_enter(&state->id_scq_poll_lock);
3943                 if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
3944                         state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
3945                         mutex_exit(&state->id_scq_poll_lock);
3946                         return;
3947                 } else {
3948                         mutex_exit(&state->id_scq_poll_lock);
3949                         ddi_trigger_softintr(state->id_tx);
3950                 }
3951         } else
3952                 (void) ibd_tx_recycle((caddr_t)state);
3953 }
3954 
3955 /*
3956  * Multicast group create/delete trap handler. These will be delivered
3957  * on a kernel thread (handling can thus block) and can be invoked
3958  * concurrently. The handler can be invoked anytime after it is
3959  * registered and before ibt_detach().
3960  */
3961 /* ARGSUSED */
3962 static void
3963 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
3964     ibt_subnet_event_t *event)
3965 {
3966         ibd_state_t *state = (ibd_state_t *)arg;
3967         ibd_req_t *req;
3968 
3969         /*
3970          * The trap handler will get invoked once for every event for
3971          * every port. The input "gid" is the GID0 of the port the
3972          * trap came in on; we just need to act on traps that came
3973          * to our port, meaning the port on which the ipoib interface
3974          * resides. Since ipoib uses GID0 of the port, we just match
3975          * the gids to check whether we need to handle the trap.
3976          */
3977         if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
3978                 return;
3979 
3980         DPRINT(10, "ibd_notices_handler : %d\n", code);
3981 
3982         switch (code) {
3983                 case IBT_SM_EVENT_UNAVAILABLE:
3984                         /*
3985                          * If we are in promiscuous mode or have
3986                          * sendnonmembers, we need to print a warning
3987                          * message right now. Else, just store the
3988                          * information, print when we enter promiscuous
3989                          * mode or attempt nonmember send. We might
3990                          * also want to stop caching sendnonmember.
3991                          */
3992                         ibd_print_warn(state, "IBA multicast support "
3993                             "degraded due to unavailability of multicast "
3994                             "traps");
3995                         break;
3996                 case IBT_SM_EVENT_AVAILABLE:
3997                         /*
3998                          * If we printed a warning message above or
3999                          * while trying to nonmember send or get into
4000                          * promiscuous mode, print an okay message.
4001                          */
4002                         ibd_print_warn(state, "IBA multicast support "
4003                             "restored due to availability of multicast "
4004                             "traps");
4005                         break;
4006                 case IBT_SM_EVENT_MCG_CREATED:
4007                 case IBT_SM_EVENT_MCG_DELETED:
4008                         /*
4009                          * If it is a "deleted" event and we are in late hca
4010                          * init, nothing to do.
4011                          */
4012                         if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4013                             IBD_DRV_IN_LATE_HCA_INIT) && (code ==
4014                             IBT_SM_EVENT_MCG_DELETED)) {
4015                                 break;
4016                         }
4017                         /*
4018                          * Common processing of creation/deletion traps.
4019                          * First check if the instance is being
4020                          * [de]initialized; back off then, without doing
4021                          * anything more, since we are not sure if the
4022                          * async thread is around, or whether we might
4023                          * be racing with the detach code in ibd_m_stop()
4024                          * that scans the mcg list.
4025                          */
4026                         if (!ibd_async_safe(state))
4027                                 return;
4028 
4029                         req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4030                         req->rq_gid = event->sm_notice_gid;
4031                         req->rq_ptr = (void *)code;
4032                         ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4033                         break;
4034         }
4035 }
4036 
4037 static void
4038 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4039 {
4040         ib_gid_t mgid = req->rq_gid;
4041         ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4042         int ret;
4043         ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff;
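        /*
         * Per the IPoIB MGID layout (ff1x:401b:<pkey>:... for IPv4 groups),
         * the partition key occupies bits 16..31 of the 64-bit GID prefix,
         * which is what the shift and mask above recover.
         */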
4044 
4045         DPRINT(10, "ibd_async_trap : %d\n", code);
4046 
4047         /*
4048          * Check whether we have already joined the IPoIB broadcast group
4049          * for our PKEY. If so, carry out the rest of the operation.
4050          * Otherwise the interface has not been initialized yet; do the
4051          * initialization here by calling ibd_start() and return.
4052          */
4053 
4054         if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4055             IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) &&
4056             (code == IBT_SM_EVENT_MCG_CREATED)) {
4057                 /*
4058                  * If we are in late HCA init and a notification for the
4059                  * creation of an MCG came in, check if it is the IPoIB MCG
4060                  * for this pkey. If not, return.
4061                  */
4062                 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey !=
4063                     state->id_pkey)) {
4064                         ibd_async_done(state);
4065                         return;
4066                 }
4067                 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4068                 /*
4069                  * Check whether starting the interface is still necessary.
4070                  * The user may have attempted an unplumb at just about the
4071                  * same time, and if the unplumb succeeded, we have nothing
4072                  * to do.
4073                  */
4074                 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4075                     IBD_DRV_IN_LATE_HCA_INIT) &&
4076                     ((ret = ibd_start(state)) != 0)) {
4077                         DPRINT(10, "ibd_async_trap: cannot start from late HCA "
4078                             "init, ret=%d", ret);
4079                 }
4080                 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4081                 ibd_async_done(state);
4082                 return;
4083         }
4084 
4085         /*
4086          * Atomically search the nonmember and send-only member lists and
4087          * delete any entry for this mcg.
4088          */
4089         ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4090 
4091         if (state->id_prom_op == IBD_OP_COMPLETED) {
4092                 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4093 
4094                 /*
4095                  * If in promiscuous mode, try to join/attach to the new
4096                  * mcg. Given the unreliable, out-of-order nature of trap
4097                  * delivery, we can never be sure whether a failed join is
4098                  * really a problem. Thus, we warn the admin of a failure
4099                  * only if this was a creation trap. Note that the trap
4100                  * might actually be reporting a long-past event, and the
4101                  * mcg might already have been deleted, so we might be
4102                  * warning in vain.
4103                  */
4104                 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4105                     NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4106                         ibd_print_warn(state, "IBA promiscuous mode missed "
4107                             "new multicast gid %016llx:%016llx",
4108                             (u_longlong_t)mgid.gid_prefix,
4109                             (u_longlong_t)mgid.gid_guid);
4110         }
4111 
4112         /*
4113          * Free the request slot allocated by the subnet event thread.
4114          */
4115         ibd_async_done(state);
4116 }
4117 
4118 /*
4119  * GLDv3 entry point to get capabilities.
4120  */
4121 static boolean_t
4122 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4123 {
4124         ibd_state_t *state = arg;
4125 
4126         if (state->id_type == IBD_PORT_DRIVER)
4127                 return (B_FALSE);
4128 
4129         switch (cap) {
4130         case MAC_CAPAB_HCKSUM: {
4131                 uint32_t *txflags = cap_data;
4132 
4133                 /*
4134                  * We either do full checksum offload or none at all
4135                  */
4136                 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4137                         *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4138                 else
4139                         return (B_FALSE);
4140                 break;
4141         }
4142 
4143         case MAC_CAPAB_LSO: {
4144                 mac_capab_lso_t *cap_lso = cap_data;
4145 
4146                 /*
4147                  * Besides the LSO capability and policy, LSO relies on
4148                  * hardware checksum, so we won't enable LSO if hardware
4149                  * checksum is unavailable.  Also, if the HCA doesn't
4150                  * provide the reserved-lkey capability, enabling LSO
4151                  * would actually hurt performance, so we disable LSO
4152                  * for that case as well.
4153                  */
4154                 if (!state->id_lso_policy || !state->id_lso_capable)
4155                         return (B_FALSE);
4156 
4157                 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4158                         return (B_FALSE);
4159 
4160                 if (state->id_hca_res_lkey_capab == 0) {
4161                         ibd_print_warn(state, "no reserved-lkey capability, "
4162                             "disabling LSO");
4163                         return (B_FALSE);
4164                 }
4165 
4166                 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4167                 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4168                 break;
4169         }
4170 
4171         default:
4172                 return (B_FALSE);
4173         }
4174 
4175         return (B_TRUE);
4176 }
4177 
4178 /*
4179  * callback function for set/get of properties
4180  */
4181 static int
4182 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4183     uint_t pr_valsize, const void *pr_val)
4184 {
4185         ibd_state_t *state = arg;
4186         int err = 0;
4187         uint32_t link_mode;
4188 
4189         /* Cannot set properties on a port driver */
4190         if (state->id_type == IBD_PORT_DRIVER) {
4191                 return (ENOTSUP);
4192         }
4193 
4194         switch (pr_num) {
4195                 case MAC_PROP_IB_LINKMODE:
4196                         if (state->id_mac_state & IBD_DRV_STARTED) {
4197                                 err = EBUSY;
4198                                 break;
4199                         }
4200                         if (pr_val == NULL) {
4201                                 err = EINVAL;
4202                                 break;
4203                         }
4204                         bcopy(pr_val, &link_mode, sizeof (link_mode));
4205                         if (link_mode != IBD_LINK_MODE_UD &&
4206                             link_mode != IBD_LINK_MODE_RC) {
4207                                 err = EINVAL;
4208                         } else {
4209                                 if (link_mode == IBD_LINK_MODE_RC) {
4210                                         if (state->id_enable_rc) {
4211                                                 return (0);
4212                                         }
4213                                         state->id_enable_rc = 1;
4214                                         /* inform MAC framework of new MTU */
4215                                         err = mac_maxsdu_update2(state->id_mh,
4216                                             state->rc_mtu - IPOIB_HDRSIZE,
4217                                             state->id_mtu - IPOIB_HDRSIZE);
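                                        /*
                                         * The first SDU above is based on the
                                         * RC MTU; the second is left at the
                                         * UD link MTU, since multicast
                                         * traffic still uses the UD QP even
                                         * in RC mode.
                                         */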
4218                                 } else {
4219                                         if (!state->id_enable_rc) {
4220                                                 return (0);
4221                                         }
4222                                         state->id_enable_rc = 0;
4223                                         err = mac_maxsdu_update2(state->id_mh,
4224                                             state->id_mtu - IPOIB_HDRSIZE,
4225                                             state->id_mtu - IPOIB_HDRSIZE);
4226                                 }
4227                                 (void) ibd_record_capab(state);
4228                                 mac_capab_update(state->id_mh);
4229                         }
4230                         break;
4231                 case MAC_PROP_PRIVATE:
4232                         err = ibd_set_priv_prop(state, pr_name,
4233                             pr_valsize, pr_val);
4234                         break;
4235                 default:
4236                         err = ENOTSUP;
4237                         break;
4238         }
4239         return (err);
4240 }
4241 
4242 static int
4243 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4244     uint_t pr_valsize, void *pr_val)
4245 {
4246         ibd_state_t *state = arg;
4247         int err = 0;
4248 
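        /*
         * A port driver rejects everything except MAC_PROP_MTU up front;
         * the MTU range itself is described through ibd_m_propinfo().
         */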
4249         switch (pr_num) {
4250                 case MAC_PROP_MTU:
4251                         break;
4252                 default:
4253                         if (state->id_type == IBD_PORT_DRIVER) {
4254                                 return (ENOTSUP);
4255                         }
4256                         break;
4257         }
4258 
4259         switch (pr_num) {
4260                 case MAC_PROP_IB_LINKMODE:
4261                         *(uint_t *)pr_val = state->id_enable_rc;
4262                         break;
4263                 case MAC_PROP_PRIVATE:
4264                         err = ibd_get_priv_prop(state, pr_name, pr_valsize,
4265                             pr_val);
4266                         break;
4267                 default:
4268                         err = ENOTSUP;
4269                         break;
4270         }
4271         return (err);
4272 }
4273 
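/*
 * Report property metadata: the default link mode, the read-only MTU range
 * appropriate to the driver type and link mode, and the built-in defaults
 * for the private properties.
 */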
4274 static void
4275 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4276     mac_prop_info_handle_t prh)
4277 {
4278         ibd_state_t *state = arg;
4279 
4280         switch (pr_num) {
4281         case MAC_PROP_IB_LINKMODE: {
4282                 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE);
4283                 break;
4284         }
4285         case MAC_PROP_MTU: {
4286                 uint32_t min, max;
4287                 if (state->id_type == IBD_PORT_DRIVER) {
4288                         min = 1500;
4289                         max = IBD_DEF_RC_MAX_SDU;
4290                 } else if (state->id_enable_rc) {
4291                         min = max = IBD_DEF_RC_MAX_SDU;
4292                 } else {
4293                         min = max = state->id_mtu - IPOIB_HDRSIZE;
4294                 }
4295                 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4296                 mac_prop_info_set_range_uint32(prh, min, max);
4297                 break;
4298         }
4299         case MAC_PROP_PRIVATE: {
4300                 char valstr[64];
4301                 int value;
4302 
4303                 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4304                         mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4305                         return;
4306                 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4307                         value = IBD_DEF_COALESCE_COMPLETIONS;
4308                 } else if (strcmp(pr_name,
4309                     "_ibd_create_broadcast_group") == 0) {
4310                         value = IBD_DEF_CREATE_BCAST_GROUP;
4311                 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4312                         value = IBD_DEF_HASH_SIZE;
4313                 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4314                         value = IBD_DEF_LSO_POLICY;
4315                 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4316                         value = IBD_DEF_NUM_AH;
4317                 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4318                         value = IBD_DEF_NUM_LSO_BUFS;
4319                 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4320                         value = IBD_DEF_RC_ENABLE_SRQ;
4321                 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4322                         value = IBD_DEF_RC_NUM_RWQE;
4323                 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4324                         value = IBD_DEF_RC_NUM_SRQ;
4325                 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4326                         value = IBD_DEF_RC_NUM_SWQE;
4327                 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4328                         value = IBD_DEF_RC_RX_COMP_COUNT;
4329                 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4330                         value = IBD_DEF_RC_RX_COMP_USEC;
4331                 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4332                         value = IBD_DEF_RC_RX_COPY_THRESH;
4333                 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4334                         value = IBD_DEF_RC_RX_RWQE_THRESH;
4335                 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4336                         value = IBD_DEF_RC_TX_COMP_COUNT;
4337                 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4338                         value = IBD_DEF_RC_TX_COMP_USEC;
4339                 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4340                         value = IBD_DEF_RC_TX_COPY_THRESH;
4341                 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4342                         value = IBD_DEF_UD_NUM_RWQE;
4343                 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4344                         value = IBD_DEF_UD_NUM_SWQE;
4345                 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4346                         value = IBD_DEF_UD_RX_COMP_COUNT;
4347                 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4348                         value = IBD_DEF_UD_RX_COMP_USEC;
4349                 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4350                         value = IBD_DEF_UD_TX_COMP_COUNT;
4351                 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4352                         value = IBD_DEF_UD_TX_COMP_USEC;
4353                 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4354                         value = IBD_DEF_UD_TX_COPY_THRESH;
4355                 } else {
4356                         return;
4357                 }
4358 
4359                 (void) snprintf(valstr, sizeof (valstr), "%d", value);
4360                 mac_prop_info_set_default_str(prh, valstr);
4361                 break;
4362         }
4363         } /* switch (pr_num) */
4364 }
4365 
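/*
 * Set a private ("_ibd_*") property. Each property follows the same pattern:
 * reject the change with EBUSY if it may only be applied while the driver is
 * stopped, parse the string value with ddi_strtol(), range-check it, and
 * store it in the softstate. Unknown property names return ENOTSUP.
 */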
4366 /* ARGSUSED2 */
4367 static int
4368 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name,
4369     uint_t pr_valsize, const void *pr_val)
4370 {
4371         int err = 0;
4372         long result;
4373 
4374         if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4375                 if (pr_val == NULL) {
4376                         return (EINVAL);
4377                 }
4378                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4379                 if (result < 0 || result > 1) {
4380                         err = EINVAL;
4381                 } else {
4382                         state->id_allow_coalesce_comp_tuning = (result == 1) ?
4383                             B_TRUE: B_FALSE;
4384                 }
4385                 return (err);
4386         }
4387         if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4388                 if (state->id_mac_state & IBD_DRV_STARTED) {
4389                         return (EBUSY);
4390                 }
4391                 if (pr_val == NULL) {
4392                         return (EINVAL);
4393                 }
4394                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4395                 if (result < 0 || result > 1) {
4396                         err = EINVAL;
4397                 } else {
4398                         state->id_create_broadcast_group = (result == 1) ?
4399                             B_TRUE: B_FALSE;
4400                 }
4401                 return (err);
4402         }
4403         if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4404                 if (state->id_mac_state & IBD_DRV_STARTED) {
4405                         return (EBUSY);
4406                 }
4407                 if (pr_val == NULL) {
4408                         return (EINVAL);
4409                 }
4410                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4411                 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) {
4412                         err = EINVAL;
4413                 } else {
4414                         state->id_hash_size = (uint32_t)result;
4415                 }
4416                 return (err);
4417         }
4418         if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4419                 if (state->id_mac_state & IBD_DRV_STARTED) {
4420                         return (EBUSY);
4421                 }
4422                 if (pr_val == NULL) {
4423                         return (EINVAL);
4424                 }
4425                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4426                 if (result < 0 || result > 1) {
4427                         err = EINVAL;
4428                 } else {
4429                         state->id_lso_policy = (result == 1) ?
4430                             B_TRUE: B_FALSE;
4431                 }
4432                 mac_capab_update(state->id_mh);
4433                 return (err);
4434         }
4435         if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4436                 if (state->id_mac_state & IBD_DRV_STARTED) {
4437                         return (EBUSY);
4438                 }
4439                 if (pr_val == NULL) {
4440                         return (EINVAL);
4441                 }
4442                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4443                 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) {
4444                         err = EINVAL;
4445                 } else {
4446                         state->id_num_ah = (uint32_t)result;
4447                 }
4448                 return (err);
4449         }
4450         if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4451                 if (state->id_mac_state & IBD_DRV_STARTED) {
4452                         return (EBUSY);
4453                 }
4454                 if (!state->id_lso_policy || !state->id_lso_capable) {
4455                         return (EINVAL);
4456                 }
4457                 if (pr_val == NULL) {
4458                         return (EINVAL);
4459                 }
4460                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4461                 if (result < IBD_MIN_NUM_LSO_BUFS ||
4462                     result > IBD_MAX_NUM_LSO_BUFS) {
4463                         err = EINVAL;
4464                 } else {
4465                         state->id_num_lso_bufs = (uint32_t)result;
4466                 }
4467                 return (err);
4468         }
4469         if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4470                 if (state->id_mac_state & IBD_DRV_STARTED) {
4471                         return (EBUSY);
4472                 }
4473                 if (pr_val == NULL) {
4474                         return (EINVAL);
4475                 }
4476                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4477                 if (result < 0 || result > 1) {
4478                         err = EINVAL;
4479                 } else {
4480                         state->rc_enable_srq = (result == 1) ?
4481                             B_TRUE: B_FALSE;
4482                 }
4483                 if (!state->rc_enable_srq) {
4484                         state->id_rc_num_srq = 0;
4485                 }
4486                 return (err);
4487         }
4488         if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4489                 if (state->id_mac_state & IBD_DRV_STARTED) {
4490                         return (EBUSY);
4491                 }
4492                 if (pr_val == NULL) {
4493                         return (EINVAL);
4494                 }
4495                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4496                 if (result < IBD_MIN_RC_NUM_RWQE ||
4497                     result > IBD_MAX_RC_NUM_RWQE) {
4498                         err = EINVAL;
4499                 } else {
4500                         state->id_rc_num_rwqe = (uint32_t)result;
4501                         if (state->id_allow_coalesce_comp_tuning &&
4502                             state->id_rc_rx_comp_count > state->id_rc_num_rwqe)
4503                                 state->id_rc_rx_comp_count =
4504                                     state->id_rc_num_rwqe;
4505                         if (state->id_rc_num_srq > state->id_rc_num_rwqe)
4506                                 state->id_rc_num_srq =
4507                                     state->id_rc_num_rwqe - 1;
4508                         /*
4509                          * If rx_rwqe_threshold is greater than the number
4510                          * of rwqes, pull it back to 25% of the number of rwqes.
4511                          */
4512                         if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe)
4513                                 state->id_rc_rx_rwqe_thresh =
4514                                     (state->id_rc_num_rwqe >> 2);
4515 
4516                 }
4517                 return (err);
4518         }
4519         if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4520                 if (state->id_mac_state & IBD_DRV_STARTED) {
4521                         return (EBUSY);
4522                 }
4523                 if (pr_val == NULL) {
4524                         return (EINVAL);
4525                 }
4526                 if (!state->rc_enable_srq)
4527                         return (EINVAL);
4528 
4529                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4530                 if (result < IBD_MIN_RC_NUM_SRQ ||
4531                     result >= state->id_rc_num_rwqe) {
4532                         err = EINVAL;
4533                 } else
4534                         state->id_rc_num_srq = (uint32_t)result;
4535                 return (err);
4536         }
4537         if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4538                 if (state->id_mac_state & IBD_DRV_STARTED) {
4539                         return (EBUSY);
4540                 }
4541                 if (pr_val == NULL) {
4542                         return (EINVAL);
4543                 }
4544                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4545                 if (result < IBD_MIN_RC_NUM_SWQE ||
4546                     result > IBD_MAX_RC_NUM_SWQE) {
4547                         err = EINVAL;
4548                 } else {
4549                         state->id_rc_num_swqe = (uint32_t)result;
4550                         if (state->id_allow_coalesce_comp_tuning &&
4551                             state->id_rc_tx_comp_count > state->id_rc_num_swqe)
4552                                 state->id_rc_tx_comp_count =
4553                                     state->id_rc_num_swqe;
4554                 }
4555                 return (err);
4556         }
4557         if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4558                 if (!state->id_allow_coalesce_comp_tuning) {
4559                         return (ENOTSUP);
4560                 }
4561                 if (pr_val == NULL) {
4562                         return (EINVAL);
4563                 }
4564                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4565                 if (result < 1 || result > state->id_rc_num_rwqe) {
4566                         err = EINVAL;
4567                 } else {
4568                         state->id_rc_rx_comp_count = (uint32_t)result;
4569                 }
4570                 return (err);
4571         }
4572         if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4573                 if (!state->id_allow_coalesce_comp_tuning) {
4574                         return (ENOTSUP);
4575                 }
4576                 if (pr_val == NULL) {
4577                         return (EINVAL);
4578                 }
4579                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4580                 if (result < 1) {
4581                         err = EINVAL;
4582                 } else {
4583                         state->id_rc_rx_comp_usec = (uint32_t)result;
4584                 }
4585                 return (err);
4586         }
4587         if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4588                 if (state->id_mac_state & IBD_DRV_STARTED) {
4589                         return (EBUSY);
4590                 }
4591                 if (pr_val == NULL) {
4592                         return (EINVAL);
4593                 }
4594                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4595                 if (result < IBD_MIN_RC_RX_COPY_THRESH ||
4596                     result > state->rc_mtu) {
4597                         err = EINVAL;
4598                 } else {
4599                         state->id_rc_rx_copy_thresh = (uint32_t)result;
4600                 }
4601                 return (err);
4602         }
4603         if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4604                 if (state->id_mac_state & IBD_DRV_STARTED) {
4605                         return (EBUSY);
4606                 }
4607                 if (pr_val == NULL) {
4608                         return (EINVAL);
4609                 }
4610                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4611                 if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
4612                     result >= state->id_rc_num_rwqe) {
4613                         err = EINVAL;
4614                 } else {
4615                         state->id_rc_rx_rwqe_thresh = (uint32_t)result;
4616                 }
4617                 return (err);
4618         }
4619         if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4620                 if (!state->id_allow_coalesce_comp_tuning) {
4621                         return (ENOTSUP);
4622                 }
4623                 if (pr_val == NULL) {
4624                         return (EINVAL);
4625                 }
4626                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4627                 if (result < 1 || result > state->id_rc_num_swqe) {
4628                         err = EINVAL;
4629                 } else {
4630                         state->id_rc_tx_comp_count = (uint32_t)result;
4631                 }
4632                 return (err);
4633         }
4634         if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4635                 if (!state->id_allow_coalesce_comp_tuning) {
4636                         return (ENOTSUP);
4637                 }
4638                 if (pr_val == NULL) {
4639                         return (EINVAL);
4640                 }
4641                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4642                 if (result < 1)
4643                         err = EINVAL;
4644                 else {
4645                         state->id_rc_tx_comp_usec = (uint32_t)result;
4646                 }
4647                 return (err);
4648         }
4649         if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4650                 if (state->id_mac_state & IBD_DRV_STARTED) {
4651                         return (EBUSY);
4652                 }
4653                 if (pr_val == NULL) {
4654                         return (EINVAL);
4655                 }
4656                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4657                 if (result < IBD_MIN_RC_TX_COPY_THRESH ||
4658                     result > state->rc_mtu) {
4659                         err = EINVAL;
4660                 } else {
4661                         state->id_rc_tx_copy_thresh = (uint32_t)result;
4662                 }
4663                 return (err);
4664         }
4665         if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4666                 if (state->id_mac_state & IBD_DRV_STARTED) {
4667                         return (EBUSY);
4668                 }
4669                 if (pr_val == NULL) {
4670                         return (EINVAL);
4671                 }
4672                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4673                 if (result < IBD_MIN_UD_NUM_RWQE ||
4674                     result > IBD_MAX_UD_NUM_RWQE) {
4675                         err = EINVAL;
4676                 } else {
4677                         if (result > state->id_hca_max_chan_sz) {
4678                                 state->id_ud_num_rwqe =
4679                                     state->id_hca_max_chan_sz;
4680                         } else {
4681                                 state->id_ud_num_rwqe = (uint32_t)result;
4682                         }
4683                         if (state->id_allow_coalesce_comp_tuning &&
4684                             state->id_ud_rx_comp_count > state->id_ud_num_rwqe)
4685                                 state->id_ud_rx_comp_count =
4686                                     state->id_ud_num_rwqe;
4687                 }
4688                 return (err);
4689         }
4690         if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4691                 if (state->id_mac_state & IBD_DRV_STARTED) {
4692                         return (EBUSY);
4693                 }
4694                 if (pr_val == NULL) {
4695                         return (EINVAL);
4696                 }
4697                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4698                 if (result < IBD_MIN_UD_NUM_SWQE ||
4699                     result > IBD_MAX_UD_NUM_SWQE) {
4700                         err = EINVAL;
4701                 } else {
4702                         if (result > state->id_hca_max_chan_sz) {
4703                                 state->id_ud_num_swqe =
4704                                     state->id_hca_max_chan_sz;
4705                         } else {
4706                                 state->id_ud_num_swqe = (uint32_t)result;
4707                         }
4708                         if (state->id_allow_coalesce_comp_tuning &&
4709                             state->id_ud_tx_comp_count > state->id_ud_num_swqe)
4710                                 state->id_ud_tx_comp_count =
4711                                     state->id_ud_num_swqe;
4712                 }
4713                 return (err);
4714         }
4715         if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4716                 if (!state->id_allow_coalesce_comp_tuning) {
4717                         return (ENOTSUP);
4718                 }
4719                 if (pr_val == NULL) {
4720                         return (EINVAL);
4721                 }
4722                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4723                 if (result < 1 || result > state->id_ud_num_rwqe) {
4724                         err = EINVAL;
4725                 } else {
4726                         state->id_ud_rx_comp_count = (uint32_t)result;
4727                 }
4728                 return (err);
4729         }
4730         if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4731                 if (!state->id_allow_coalesce_comp_tuning) {
4732                         return (ENOTSUP);
4733                 }
4734                 if (pr_val == NULL) {
4735                         return (EINVAL);
4736                 }
4737                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4738                 if (result < 1) {
4739                         err = EINVAL;
4740                 } else {
4741                         state->id_ud_rx_comp_usec = (uint32_t)result;
4742                 }
4743                 return (err);
4744         }
4745         if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4746                 if (!state->id_allow_coalesce_comp_tuning) {
4747                         return (ENOTSUP);
4748                 }
4749                 if (pr_val == NULL) {
4750                         return (EINVAL);
4751                 }
4752                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4753                 if (result < 1 || result > state->id_ud_num_swqe) {
4754                         err = EINVAL;
4755                 } else {
4756                         state->id_ud_tx_comp_count = (uint32_t)result;
4757                 }
4758                 return (err);
4759         }
4760         if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4761                 if (!state->id_allow_coalesce_comp_tuning) {
4762                         return (ENOTSUP);
4763                 }
4764                 if (pr_val == NULL) {
4765                         return (EINVAL);
4766                 }
4767                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4768                 if (result < 1) {
4769                         err = EINVAL;
4770                 } else {
4771                         state->id_ud_tx_comp_usec = (uint32_t)result;
4772                 }
4773                 return (err);
4774         }
4775         if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4776                 if (state->id_mac_state & IBD_DRV_STARTED) {
4777                         return (EBUSY);
4778                 }
4779                 if (pr_val == NULL) {
4780                         return (EINVAL);
4781                 }
4782                 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4783                 if (result < IBD_MIN_UD_TX_COPY_THRESH ||
4784                     result > IBD_MAX_UD_TX_COPY_THRESH) {
4785                         err = EINVAL;
4786                 } else {
4787                         state->id_ud_tx_copy_thresh = (uint32_t)result;
4788                 }
4789                 return (err);
4790         }
4791         return (ENOTSUP);
4792 }
4793 
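/*
 * Get a private ("_ibd_*") property: look up the current value in the
 * softstate and return it formatted as a decimal string. Unknown property
 * names return ENOTSUP.
 */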
4794 static int
4795 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize,
4796     void *pr_val)
4797 {
4798         int err = ENOTSUP;
4799         int value;
4800 
4801         if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4802                 value = state->id_bgroup_present;
4803                 err = 0;
4804                 goto done;
4805         }
4806         if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4807                 value = state->id_allow_coalesce_comp_tuning;
4808                 err = 0;
4809                 goto done;
4810         }
4811         if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4812                 value = state->id_create_broadcast_group;
4813                 err = 0;
4814                 goto done;
4815         }
4816         if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4817                 value = state->id_hash_size;
4818                 err = 0;
4819                 goto done;
4820         }
4821         if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4822                 value = state->id_lso_policy;
4823                 err = 0;
4824                 goto done;
4825         }
4826         if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4827                 value = state->id_num_ah;
4828                 err = 0;
4829                 goto done;
4830         }
4831         if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4832                 value = state->id_num_lso_bufs;
4833                 err = 0;
4834                 goto done;
4835         }
4836         if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4837                 value = state->rc_enable_srq;
4838                 err = 0;
4839                 goto done;
4840         }
4841         if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4842                 value = state->id_rc_num_rwqe;
4843                 err = 0;
4844                 goto done;
4845         }
4846         if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4847                 value = state->id_rc_num_srq;
4848                 err = 0;
4849                 goto done;
4850         }
4851         if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4852                 value = state->id_rc_num_swqe;
4853                 err = 0;
4854                 goto done;
4855         }
4856         if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4857                 value = state->id_rc_rx_comp_count;
4858                 err = 0;
4859                 goto done;
4860         }
4861         if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4862                 value = state->id_rc_rx_comp_usec;
4863                 err = 0;
4864                 goto done;
4865         }
4866         if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4867                 value = state->id_rc_rx_copy_thresh;
4868                 err = 0;
4869                 goto done;
4870         }
4871         if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4872                 value = state->id_rc_rx_rwqe_thresh;
4873                 err = 0;
4874                 goto done;
4875         }
4876         if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4877                 value = state->id_rc_tx_comp_count;
4878                 err = 0;
4879                 goto done;
4880         }
4881         if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4882                 value = state->id_rc_tx_comp_usec;
4883                 err = 0;
4884                 goto done;
4885         }
4886         if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4887                 value = state->id_rc_tx_copy_thresh;
4888                 err = 0;
4889                 goto done;
4890         }
4891         if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4892                 value = state->id_ud_num_rwqe;
4893                 err = 0;
4894                 goto done;
4895         }
4896         if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4897                 value = state->id_ud_num_swqe;
4898                 err = 0;
4899                 goto done;
4900         }
4901         if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4902                 value = state->id_ud_rx_comp_count;
4903                 err = 0;
4904                 goto done;
4905         }
4906         if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4907                 value = state->id_ud_rx_comp_usec;
4908                 err = 0;
4909                 goto done;
4910         }
4911         if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4912                 value = state->id_ud_tx_comp_count;
4913                 err = 0;
4914                 goto done;
4915         }
4916         if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4917                 value = state->id_ud_tx_comp_usec;
4918                 err = 0;
4919                 goto done;
4920         }
4921         if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4922                 value = state->id_ud_tx_copy_thresh;
4923                 err = 0;
4924                 goto done;
4925         }
4926 done:
4927         if (err == 0) {
4928                 (void) snprintf(pr_val, pr_valsize, "%d", value);
4929         }
4930         return (err);
4931 }
4932 
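/*
 * Query the HCA for the current state of our port. If the port is active,
 * verify our pkey (recording its index) and record the MTU, GID0 and port
 * speed in the softstate; otherwise mark the link down and leave those
 * fields to later PORT_UP/CHANGE processing.
 */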
4933 static int
4934 ibd_get_port_details(ibd_state_t *state)
4935 {
4936         ibt_hca_portinfo_t *port_infop;
4937         ibt_status_t ret;
4938         uint_t psize, port_infosz;
4939 
4940         mutex_enter(&state->id_link_mutex);
4941 
4942         /*
4943          * Query for port information
4944          */
4945         ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
4946             &port_infop, &psize, &port_infosz);
4947         if ((ret != IBT_SUCCESS) || (psize != 1)) {
4948                 mutex_exit(&state->id_link_mutex);
4949                 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
4950                     "failed, ret=%d", ret);
4951                 return (ENETDOWN);
4952         }
4953 
4954         /*
4955          * If the link is active, verify the pkey
4956          */
4957         if (port_infop->p_linkstate == IBT_PORT_ACTIVE) {
4958                 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
4959                     state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
4960                         state->id_link_state = LINK_STATE_DOWN;
4961                 } else {
4962                         state->id_link_state = LINK_STATE_UP;
4963                 }
4964                 state->id_mtu = (128 << port_infop->p_mtu);
4965                 state->id_sgid = *port_infop->p_sgid_tbl;
4966                 /*
4967                  * Now that the port is active, record the port speed
4968                  */
4969                 state->id_link_speed = ibd_get_portspeed(state);
4970         } else {
4971                 /* Make sure that these are handled in PORT_UP/CHANGE */
4972                 state->id_mtu = 0;
4973                 state->id_link_state = LINK_STATE_DOWN;
4974                 state->id_link_speed = 0;
4975         }
4976         mutex_exit(&state->id_link_mutex);
4977         ibt_free_portinfo(port_infop, port_infosz);
4978 
4979         return (0);
4980 }
4981 
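/*
 * Allocate the receive and send completion queues, each sized to one more
 * than the corresponding WQE count (capped at the HCA's maximum CQ size,
 * shrinking the WQE count to fit if necessary), apply the configured
 * interrupt moderation settings, and allocate the work-completion arrays
 * used when polling them.
 */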
4982 static int
4983 ibd_alloc_cqs(ibd_state_t *state)
4984 {
4985         ibt_hca_attr_t hca_attrs;
4986         ibt_cq_attr_t cq_attr;
4987         ibt_status_t ret;
4988         uint32_t real_size;
4989         uint_t num_rwqe_change = 0;
4990         uint_t num_swqe_change = 0;
4991 
4992         ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
4993         ASSERT(ret == IBT_SUCCESS);
4994 
4995         /*
4996          * Allocate Rx/combined CQ:
4997          * Theoretically, there is no point in having more than #rwqe
4998          * plus #swqe cqe's, except that the CQ will be signaled for
4999          * overflow when the last wqe completes, if none of the previous
5000          * cqe's have been polled. Thus, we allocate slightly fewer wqe's
5001          * than cqe's to make sure such an overflow does not occur.
5002          */
5003         cq_attr.cq_sched = NULL;
5004         cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
5005 
5006         /*
5007          * Allocate Receive CQ.
5008          */
5009         if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) {
5010                 cq_attr.cq_size = state->id_ud_num_rwqe + 1;
5011         } else {
5012                 cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5013                 num_rwqe_change = state->id_ud_num_rwqe;
5014                 state->id_ud_num_rwqe = cq_attr.cq_size - 1;
5015         }
5016 
5017         if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5018             &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
5019                 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
5020                     "failed, ret=%d\n", ret);
5021                 return (DDI_FAILURE);
5022         }
5023 
5024         if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count,
5025             state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) {
5026                 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
5027                     "moderation failed, ret=%d\n", ret);
5028         }
5029 
5030         /* make the #rx wc's the same as max rx chain size */
5031         state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
5032         state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
5033             state->id_rxwcs_size, KM_SLEEP);
5034 
5035         /*
5036          * Allocate Send CQ.
5037          */
5038         if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) {
5039                 cq_attr.cq_size = state->id_ud_num_swqe + 1;
5040         } else {
5041                 cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5042                 num_swqe_change = state->id_ud_num_swqe;
5043                 state->id_ud_num_swqe = cq_attr.cq_size - 1;
5044         }
5045 
5046         if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5047             &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
5048                 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
5049                     "failed, ret=%d\n", ret);
5050                 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
5051                     state->id_rxwcs_size);
5052                 (void) ibt_free_cq(state->id_rcq_hdl);
5053                 return (DDI_FAILURE);
5054         }
5055         if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count,
5056             state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) {
5057                 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
5058                     "moderation failed, ret=%d\n", ret);
5059         }
5060 
5061         state->id_txwcs_size = IBD_TX_POLL_THRESH;
5062         state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
5063             state->id_txwcs_size, KM_SLEEP);
5064 
5065         /*
5066          * Print a message in case we could not allocate as many wqe's
5067          * as were requested.
5068          */
5069         if (num_rwqe_change) {
5070                 ibd_print_warn(state, "Setting #rwqe = %d instead of default "
5071                     "%d", state->id_ud_num_rwqe, num_rwqe_change);
5072         }
5073         if (num_swqe_change) {
5074                 ibd_print_warn(state, "Setting #swqe = %d instead of default "
5075                     "%d", state->id_ud_num_swqe, num_swqe_change);
5076         }
5077 
5078         return (DDI_SUCCESS);
5079 }
5080 
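/*
 * Allocate the UD channel (QP) over the send and receive CQs, using the
 * broadcast group's qkey and our pkey index, and record the resulting QPN.
 */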
5081 static int
5082 ibd_setup_ud_channel(ibd_state_t *state)
5083 {
5084         ibt_ud_chan_alloc_args_t ud_alloc_attr;
5085         ibt_ud_chan_query_attr_t ud_chan_attr;
5086         ibt_status_t ret;
5087 
5088         ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
5089         if (state->id_hca_res_lkey_capab)
5090                 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
5091         if (state->id_lso_policy && state->id_lso_capable)
5092                 ud_alloc_attr.ud_flags |= IBT_USES_LSO;
5093 
5094         ud_alloc_attr.ud_hca_port_num   = state->id_port;
5095         ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
5096         ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
5097         ud_alloc_attr.ud_sizes.cs_sq    = state->id_ud_num_swqe;
5098         ud_alloc_attr.ud_sizes.cs_rq    = state->id_ud_num_rwqe;
5099         ud_alloc_attr.ud_qkey           = state->id_mcinfo->mc_qkey;
5100         ud_alloc_attr.ud_scq            = state->id_scq_hdl;
5101         ud_alloc_attr.ud_rcq            = state->id_rcq_hdl;
5102         ud_alloc_attr.ud_pd             = state->id_pd_hdl;
5103         ud_alloc_attr.ud_pkey_ix        = state->id_pkix;
5104         ud_alloc_attr.ud_clone_chan     = NULL;
5105 
5106         if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
5107             &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
5108                 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
5109                     "failed, ret=%d\n", ret);
5110                 return (DDI_FAILURE);
5111         }
5112 
5113         if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
5114             &ud_chan_attr)) != IBT_SUCCESS) {
5115                 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
5116                     "failed, ret=%d\n", ret);
5117                 (void) ibt_free_channel(state->id_chnl_hdl);
5118                 return (DDI_FAILURE);
5119         }
5120 
5121         state->id_qpnum = ud_chan_attr.ud_qpn;
5122 
5123         return (DDI_SUCCESS);
5124 }
5125 
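/*
 * Undo whatever ibd_start() managed to set up, as recorded in the
 * id_mac_state progress bits; the link is marked down, or unknown if it
 * was previously up, so the IP layer stops using this instance.
 */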
5126 static int
5127 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
5128 {
5129         uint32_t progress = state->id_mac_state;
5130         uint_t attempts;
5131         ibt_status_t ret;
5132         ib_gid_t mgid;
5133         ibd_mce_t *mce;
5134         uint8_t jstate;
5135         timeout_id_t tid;
5136 
5137         if (atomic_dec_32_nv(&state->id_running) != 0)
5138                 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
5139 
5140         /*
5141          * Before we try to stop/undo whatever we did in ibd_start(),
5142          * we need to mark the link state appropriately to prevent the
5143          * ip layer from using this instance for any new transfers. Note
5144          * that if the original state of the link was "up" when we're
5145          * here, we'll set the final link state to "unknown", to behave
5146          * in the same fashion as other ethernet drivers.
5147          */
5148         mutex_enter(&state->id_link_mutex);
5149         if (cur_link_state == LINK_STATE_DOWN) {
5150                 state->id_link_state = cur_link_state;
5151         } else {
5152                 state->id_link_state = LINK_STATE_UNKNOWN;
5153         }
5154         mutex_exit(&state->id_link_mutex);
5155         bzero(&state->id_macaddr, sizeof (ipoib_mac_t));
5156         mac_link_update(state->id_mh, state->id_link_state);
5157 
5158         state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
5159         if (progress & IBD_DRV_STARTED) {
5160                 state->id_mac_state &= (~IBD_DRV_STARTED);
5161         }
5162 
5163         if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
5164                 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT);
5165         }
5166 
5167         /* Stop listen under Reliable Connected Mode */
5168         if (progress & IBD_DRV_RC_LISTEN) {
5169                 ASSERT(state->id_enable_rc);
5170                 if (state->rc_listen_hdl != NULL) {
5171                         ibd_rc_stop_listen(state);
5172                 }
5173                 state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
5174         }
5175 
5176         /* Stop timeout routine */
5177         if (progress & IBD_DRV_RC_TIMEOUT) {
5178                 ASSERT(state->id_enable_rc);
5179                 mutex_enter(&state->rc_timeout_lock);
5180                 state->rc_timeout_start = B_FALSE;
5181                 tid = state->rc_timeout;
5182                 state->rc_timeout = 0;
5183                 mutex_exit(&state->rc_timeout_lock);
5184                 if (tid != 0)
5185                         (void) untimeout(tid);
5186                 state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT);
5187         }
5188 
5189         if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
5190                 attempts = 100;
5191                 while (state->id_ah_op == IBD_OP_ONGOING) {
5192                         /*
5193                          * "state->id_ah_op == IBD_OP_ONGOING" means this
5194                          * IPoIB port is connecting to a remote IPoIB port.
5195                          * Wait for that connection attempt to finish.
5196                          */
5197                         delay(drv_usectohz(100000));
5198                         if (--attempts == 0) {
5199                                 state->rc_stop_connect++;
5200                                 DPRINT(40, "ibd_undo_start: connecting");
5201                                 break;
5202                         }
5203                 }
5204                 mutex_enter(&state->id_sched_lock);
5205                 state->id_sched_needed = 0;
5206                 mutex_exit(&state->id_sched_lock);
5207                 (void) ibd_rc_close_all_chan(state);
5208         }
5209 
5210         /*
5211          * First, stop receive interrupts; this stops the driver from
5212          * handing up buffers to higher layers.  Wait for receive buffers
5213          * to be returned and give up after 1 second.
5214          */
5215         if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
5216                 attempts = 10;
5217                 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
5218                     0) > 0) {
5219                         delay(drv_usectohz(100000));
5220                         if (--attempts == 0) {
5221                                 /*
5222                                  * There are buffers still held by the
5223                                  * network layer and we have no choice but
5224                                  * to wait until it is done with them. Reap
5225                                  * all the Tx/Rx completions that were
5226                                  * posted since we turned off the
5227                                  * notification and return failure.
5228                                  */
5229                                 cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
5230                                 DPRINT(2, "ibd_undo_start: "
5231                                     "reclaiming failed");
5232                                 break;
5233                         }
5234                 }
5235                 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
5236         }
5237 
5238         if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
5239                 ibd_rc_fini_tx_largebuf_list(state);
5240                 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
5241         }
5242 
5243         if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
5244                 ASSERT(state->id_enable_rc);
5245                 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
5246                         if (state->id_ah_op == IBD_OP_ONGOING) {
5247                                 delay(drv_usectohz(10000));
5248                                 if (state->id_ah_op == IBD_OP_ONGOING) {
5249                                         /*
5250                                          * "state->id_ah_op == IBD_OP_ONGOING"
5251                                          * means this IPoIB port is connecting
5252                                          * to a remote IPoIB port. We can't
5253                                          * delete SRQ here.
5254                                          */
5255                                         state->rc_stop_connect++;
5256                                         DPRINT(40, "ibd_undo_start: "
5257                                             "connecting");
5258                                 } else {
5259                                         ibd_rc_fini_srq_list(state);
5260                                         state->id_mac_state &=
5261                                             (~IBD_DRV_RC_SRQ_ALLOCD);
5262                                 }
5263                         } else {
5264                                 ibd_rc_fini_srq_list(state);
5265                                 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
5266                         }
5267                 } else {
5268                         DPRINT(40, "ibd_undo_start: srq bufs outstanding\n");
5269                 }
5270         }
5271 
5272         if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
5273                 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
5274 
5275                 mutex_enter(&state->id_trap_lock);
5276                 state->id_trap_stop = B_TRUE;
5277                 while (state->id_trap_inprog > 0)
5278                         cv_wait(&state->id_trap_cv, &state->id_trap_lock);
5279                 mutex_exit(&state->id_trap_lock);
5280 
5281                 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
5282         }
5283 
5284         if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
5285                 /*
5286                  * Flushing the channel ensures that all pending WQE's
5287                  * are marked with flush_error and handed to the CQ. It
5288                  * does not guarantee the invocation of the CQ handler.
5289                  * This call is guaranteed to return successfully for
5290                  * UD QPNs.
5291                  */
5292                 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
5293                     IBT_SUCCESS) {
5294                         DPRINT(10, "ibd_undo_start: flush_channel "
5295                             "failed, ret=%d", ret);
5296                 }
5297 
5298                 /*
5299                  * Give some time for the TX CQ handler to process the
5300                  * completions.
5301                  */
5302                 attempts = 10;
5303                 mutex_enter(&state->id_tx_list.dl_mutex);
5304                 mutex_enter(&state->id_tx_rel_list.dl_mutex);
5305                 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
5306                     != state->id_ud_num_swqe) {
5307                         if (--attempts == 0)
5308                                 break;
5309                         mutex_exit(&state->id_tx_rel_list.dl_mutex);
5310                         mutex_exit(&state->id_tx_list.dl_mutex);
5311                         delay(drv_usectohz(100000));
5312                         mutex_enter(&state->id_tx_list.dl_mutex);
5313                         mutex_enter(&state->id_tx_rel_list.dl_mutex);
5314                 }
5315                 ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5316                 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
5317                     state->id_ud_num_swqe) {
5318                         cmn_err(CE_WARN, "tx resources not freed\n");
5319                 }
5320                 mutex_exit(&state->id_tx_rel_list.dl_mutex);
5321                 mutex_exit(&state->id_tx_list.dl_mutex);
5322 
5323                 attempts = 10;
5324                 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5325                         if (--attempts == 0)
5326                                 break;
5327                         delay(drv_usectohz(100000));
5328                 }
5329                 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
5330                 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5331                         cmn_err(CE_WARN, "rx resources not freed\n");
5332                 }
5333 
5334                 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
5335         }
5336 
5337         if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
5338                 /*
5339                  * Drop all residual full/non membership. This includes full
5340                  * membership to the broadcast group, and any nonmembership
5341                  * acquired during transmits. We do this after the Tx completion
5342                  * handlers are done, since those might result in some late
5343                  * leaves; this also eliminates a potential race with that
5344                  * path wrt the mc full list insert/delete. Trap handling
5345                  * has also been suppressed at this point. Thus, no locks
5346                  * are required while traversing the mc full list.
5347                  */
5348                 DPRINT(2, "ibd_undo_start: clear full cache entries");
5349                 mce = list_head(&state->id_mc_full);
5350                 while (mce != NULL) {
5351                         mgid = mce->mc_info.mc_adds_vect.av_dgid;
5352                         jstate = mce->mc_jstate;
5353                         mce = list_next(&state->id_mc_full, mce);
5354                         ibd_leave_group(state, mgid, jstate);
5355                 }
5356                 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5357         }
5358 
5359         if (progress & IBD_DRV_RXLIST_ALLOCD) {
5360                 ibd_fini_rxlist(state);
5361                 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5362         }
5363 
5364         if (progress & IBD_DRV_TXLIST_ALLOCD) {
5365                 ibd_fini_txlist(state);
5366                 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5367         }
5368 
5369         if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5370                 if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5371                     IBT_SUCCESS) {
5372                         DPRINT(10, "ibd_undo_start: free_channel "
5373                             "failed, ret=%d", ret);
5374                 }
5375 
5376                 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5377         }
5378 
5379         if (progress & IBD_DRV_CQS_ALLOCD) {
5380                 kmem_free(state->id_txwcs,
5381                     sizeof (ibt_wc_t) * state->id_txwcs_size);
5382                 if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5383                     IBT_SUCCESS) {
5384                         DPRINT(10, "ibd_undo_start: free_cq(scq) "
5385                             "failed, ret=%d", ret);
5386                 }
5387 
5388                 kmem_free(state->id_rxwcs,
5389                     sizeof (ibt_wc_t) * state->id_rxwcs_size);
5390                 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5391                         DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5392                             "ret=%d", ret);
5393                 }
5394 
5395                 state->id_txwcs = NULL;
5396                 state->id_rxwcs = NULL;
5397                 state->id_scq_hdl = NULL;
5398                 state->id_rcq_hdl = NULL;
5399 
5400                 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5401         }
5402 
5403         if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5404                 mutex_enter(&state->id_ac_mutex);
5405                 mod_hash_destroy_hash(state->id_ah_active_hash);
5406                 mutex_exit(&state->id_ac_mutex);
5407                 ibd_acache_fini(state);
5408 
5409                 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5410         }
5411 
5412         if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5413                 /*
5414                  * If we'd created the ipoib broadcast group and had
5415                  * successfully joined it, leave it now
5416                  */
5417                 if (state->id_bgroup_created) {
5418                         mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5419                         jstate = IB_MC_JSTATE_FULL;
5420                         (void) ibt_leave_mcg(state->id_sgid, mgid,
5421                             state->id_sgid, jstate);
5422                 }
5423                 ibt_free_mcg_info(state->id_mcinfo, 1);
5424 
5425                 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5426         }
5427 
5428         return (DDI_SUCCESS);
5429 }
5430 
5431 /*
5432  * This pair of routines is used to set/clear the condition that
5433  * the caller is likely to do something to change the id_mac_state.
5434  * If there's already someone doing either a start or a stop (possibly
5435  * due to the async handler detecting a pkey relocation event, a plumb
5436  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5437  * that's done.
5438  */
5439 static void
5440 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5441 {
5442         mutex_enter(&state->id_macst_lock);
5443         while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5444                 cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5445 
5446         state->id_mac_state |= flag;
5447         mutex_exit(&state->id_macst_lock);
5448 }
5449 
5450 static void
5451 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5452 {
5453         mutex_enter(&state->id_macst_lock);
5454         state->id_mac_state &= (~flag);
5455         cv_signal(&state->id_macst_cv);
5456         mutex_exit(&state->id_macst_lock);
5457 }
5458 
5459 /*
5460  * GLDv3 entry point to start hardware.
5461  */
5462 /*ARGSUSED*/
5463 static int
5464 ibd_m_start(void *arg)
5465 {
5466         ibd_state_t *state = arg;
5467         int     ret;
5468 
5469         if (state->id_type == IBD_PORT_DRIVER)
5470                 return (EINVAL);
5471 
5472         ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5473         if (state->id_mac_state & IBD_DRV_IN_DELETION) {
5474                 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5475                 return (EIO);
5476         }
5477 
5478         ret = ibd_start(state);
5479         ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5480         return (ret);
5481 }
5482 
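     /*
      * Do the real work of bringing the interface up: obtain the port
      * details, find and join the IPoIB broadcast group, and set up the
      * address cache, CQs, UD channel and Tx/Rx buffer lists. Each
      * successfully completed step is recorded in id_mac_state so that
      * ibd_undo_start() can tear down exactly what was set up if a later
      * step fails. If the port or pkey is not yet usable, we park in the
      * late HCA initialization state and let the subnet notices handler
      * complete the start later (e.g. on an MCG created notice).
      */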
5483 static int
5484 ibd_start(ibd_state_t *state)
5485 {
5486         int err;
5487         ibt_status_t ret;
5488         int late_hca_init = 0;
5489 
5490         if (state->id_mac_state & IBD_DRV_STARTED)
5491                 return (DDI_SUCCESS);
5492 
5493         /*
5494          * We do not increment the running flag when calling ibd_start() as
5495          * a result of some event which moves the state away from late HCA
5496          * initialization, viz. MCG_CREATED, PORT_CHANGE or link availability.
5497          */
5498         if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
5499             (atomic_inc_32_nv(&state->id_running) != 1)) {
5500                 DPRINT(10, "ibd_start: id_running is non-zero");
5501                 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
5502                 atomic_dec_32(&state->id_running);
5503                 return (EINVAL);
5504         }
5505 
5506         /*
5507          * Get port details; if we fail here, something bad happened.
5508          * Fail plumb.
5509          */
5510         if ((err = ibd_get_port_details(state)) != 0) {
5511                 DPRINT(10, "ibd_start: ibd_get_port_details() failed");
5512                 goto start_fail;
5513         }
5514         /*
5515          * If state->id_link_state is DOWN, it indicates that either the port
5516          * is down, or the pkey is not available. In both cases, resort to late
5517          * initialization. Register for subnet notices, and return success.
5518          */
5519         state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
5520         if (state->id_link_state == LINK_STATE_DOWN) {
5521                 late_hca_init = 1;
5522                 goto late_hca_init_return;
5523         }
5524 
5525         /*
5526          * Find the IPoIB broadcast group
5527          */
5528         if (ibd_find_bgroup(state) != IBT_SUCCESS) {
5529                 /* Resort to late initialization */
5530                 late_hca_init = 1;
5531                 goto reg_snet_notices;
5532         }
5533         state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
5534 
5535         /*
5536          * Initialize per-interface caches and lists; if we fail here,
5537          * it is most likely due to a lack of resources
5538          */
5539         if (ibd_acache_init(state) != DDI_SUCCESS) {
5540                 DPRINT(10, "ibd_start: ibd_acache_init() failed");
5541                 err = ENOMEM;
5542                 goto start_fail;
5543         }
5544         state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
5545 
5546         /*
5547          * Allocate send and receive completion queues
5548          */
5549         if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
5550                 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
5551                 err = ENOMEM;
5552                 goto start_fail;
5553         }
5554         state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
5555 
5556         /*
5557          * Setup a UD channel
5558          */
5559         if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
5560                 err = ENOMEM;
5561                 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
5562                 goto start_fail;
5563         }
5564         state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
5565 
5566         /*
5567          * Allocate and initialize the tx buffer list
5568          */
5569         if (ibd_init_txlist(state) != DDI_SUCCESS) {
5570                 DPRINT(10, "ibd_start: ibd_init_txlist() failed");
5571                 err = ENOMEM;
5572                 goto start_fail;
5573         }
5574         state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
5575 
5576         /*
5577          * Create the send cq handler here
5578          */
5579         ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
5580         if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
5581             IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5582                 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
5583                     "failed, ret=%d", ret);
5584                 err = EINVAL;
5585                 goto start_fail;
5586         }
5587         state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
5588 
5589         /*
5590          * Allocate and initialize the rx buffer list
5591          */
5592         if (ibd_init_rxlist(state) != DDI_SUCCESS) {
5593                 DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
5594                 err = ENOMEM;
5595                 goto start_fail;
5596         }
5597         state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
5598 
5599         /*
5600          * Join IPoIB broadcast group
5601          */
5602         if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
5603                 DPRINT(10, "ibd_start: ibd_join_group() failed");
5604                 err = ENOTACTIVE;
5605                 goto start_fail;
5606         }
5607         state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
5608 
5609         /*
5610          * When we did mac_register() in ibd_attach(), we didn't register
5611          * the real macaddr and we didn't have the true port mtu. Now that
5612          * we're almost ready, set the local mac address and broadcast
5613          * addresses and update gldv3 about the real values of these
5614          * parameters.
5615          */
5616         if (state->id_enable_rc) {
5617                 ibd_h2n_mac(&state->id_macaddr,
5618                     IBD_MAC_ADDR_RC + state->id_qpnum,
5619                     state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5620                 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
5621                     state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5622         } else {
5623                 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
5624                     state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5625         }
5626         ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
5627             state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
5628 
5629         if (!state->id_enable_rc) {
5630                 (void) mac_maxsdu_update2(state->id_mh,
5631                     state->id_mtu - IPOIB_HDRSIZE,
5632                     state->id_mtu - IPOIB_HDRSIZE);
5633         }
5634         mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
5635 
5636         /*
5637          * Setup the receive cq handler
5638          */
5639         ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
5640         if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
5641             IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5642                 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
5643                     "failed, ret=%d", ret);
5644                 err = EINVAL;
5645                 goto start_fail;
5646         }
5647         state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
5648 
5649 reg_snet_notices:
5650         /*
5651          * In the case of the normal initialization sequence, set up the
5652          * subnet notices handler after we've initialized the acache/
5653          * mcache and started the async thread, both of which are required
5654          * for the trap handler to function properly.
5655          *
5656          * Now that the async thread has been started (and we've already done
5657          * a mac_register() during attach so mac_tx_update() can be called
5658          * if necessary without any problem), we can enable the trap handler
5659          * to queue requests to the async thread.
5660          *
5661          * In the case of late hca initialization, the subnet notices handler
5662          * will only handle MCG created/deleted events. The action performed
5663          * as part of handling these events is to start the interface. So,
5664          * the acache/mcache initialization is not a necessity in such cases
5665          * for registering the subnet notices handler. Also, if we are in
5666          * ibd_start() as a result of, say, some event handling after entering
5667          * the late hca initialization phase, there is no need to register again.
5668          */
5669         if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) {
5670                 ibt_register_subnet_notices(state->id_ibt_hdl,
5671                     ibd_snet_notices_handler, state);
5672                 mutex_enter(&state->id_trap_lock);
5673                 state->id_trap_stop = B_FALSE;
5674                 mutex_exit(&state->id_trap_lock);
5675                 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
5676         }
5677 
5678 late_hca_init_return:
5679         if (late_hca_init == 1) {
5680                 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT;
5681                 /*
5682                  * In the case of late initialization, mark the link state as down
5683                  * regardless of the actual link state as reported in the
5684                  * port_info.
5685                  */
5686                 state->id_link_state = LINK_STATE_DOWN;
5687                 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
5688                 mac_link_update(state->id_mh, state->id_link_state);
5689                 return (DDI_SUCCESS);
5690         }
5691 
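             /*
              * For the RC (connected) mode, allocate the SRQ receive
              * resources (when SRQ is enabled) and the large-send buffer
              * pool, and only then start listening for incoming RC
              * connection requests.
              */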
5692         if (state->id_enable_rc) {
5693                 if (state->rc_enable_srq) {
5694                         if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
5695                                 if (ibd_rc_repost_srq_free_list(state) !=
5696                                     IBT_SUCCESS) {
5697                                         err = ENOMEM;
5698                                         goto start_fail;
5699                                 }
5700                         } else {
5701                                 /* Allocate SRQ resource */
5702                                 if (ibd_rc_init_srq_list(state) !=
5703                                     IBT_SUCCESS) {
5704                                         err = ENOMEM;
5705                                         goto start_fail;
5706                                 }
5707                                 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
5708                         }
5709                 }
5710 
5711                 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
5712                         DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
5713                             "failed");
5714                         err = ENOMEM;
5715                         goto start_fail;
5716                 }
5717                 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
5718 
5719                 /* RC: begin to listen only after everything is available */
5720                 if (ibd_rc_listen(state) != IBT_SUCCESS) {
5721                         DPRINT(10, "ibd_start: ibd_rc_listen() failed");
5722                         err = EINVAL;
5723                         goto start_fail;
5724                 }
5725                 state->id_mac_state |= IBD_DRV_RC_LISTEN;
5726         }
5727 
5728         /*
5729          * Indicate link status to GLDv3 and higher layers. By default,
5730          * we assume we are in up state (which must have been true at
5731          * least at the time the broadcast mcg's were probed); if there
5732          * were any up/down transitions till the time we come here, the
5733          * async handler will have updated the last known state, which we
5734          * use to tell GLDv3. The async handler will not send any
5735          * notifications to GLDv3 till we reach here in the initialization
5736          * sequence.
5737          */
5738         mac_link_update(state->id_mh, state->id_link_state);
5739         state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT;
5740         state->id_mac_state |= IBD_DRV_STARTED;
5741 
5742         /* Start timer after everything is ready */
5743         if (state->id_enable_rc) {
5744                 mutex_enter(&state->rc_timeout_lock);
5745                 state->rc_timeout_start = B_TRUE;
5746                 state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
5747                     SEC_TO_TICK(ibd_rc_conn_timeout));
5748                 mutex_exit(&state->rc_timeout_lock);
5749                 state->id_mac_state |= IBD_DRV_RC_TIMEOUT;
5750         }
5751 
5752         return (DDI_SUCCESS);
5753 
5754 start_fail:
5755         /*
5756          * If we ran into a problem during ibd_start() and ran into
5757          * some other problem while undoing our partial work, we can't
5758          * do anything about it.  Ignore any errors we might get from
5759          * ibd_undo_start() and just return the original error we got.
5760          */
5761         (void) ibd_undo_start(state, LINK_STATE_DOWN);
5762         return (err);
5763 }
5764 
5765 /*
5766  * GLDv3 entry point to stop hardware from receiving packets.
5767  */
5768 /*ARGSUSED*/
5769 static void
5770 ibd_m_stop(void *arg)
5771 {
5772         ibd_state_t *state = (ibd_state_t *)arg;
5773 
5774         if (state->id_type == IBD_PORT_DRIVER)
5775                 return;
5776 
5777         ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
5778 
5779         (void) ibd_undo_start(state, state->id_link_state);
5780 
5781         ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
5782 }
5783 
5784 /*
5785  * GLDv3 entry point to modify the device's mac address. We do not
5786  * allow address modifications.
5787  */
5788 static int
5789 ibd_m_unicst(void *arg, const uint8_t *macaddr)
5790 {
5791         ibd_state_t *state = arg;
5792 
5793         if (state->id_type == IBD_PORT_DRIVER)
5794                 return (EINVAL);
5795 
5796         /*
5797          * Don't bother even comparing the macaddr if we haven't
5798          * completed ibd_m_start().
5799          */
5800         if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5801                 return (0);
5802 
5803         if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
5804                 return (0);
5805         else
5806                 return (EINVAL);
5807 }
5808 
5809 /*
5810  * The blocking part of the IBA join/leave operations is done out
5811  * of here on the async thread.
5812  */
5813 static void
5814 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
5815 {
5816         DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
5817             "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
5818 
5819         if (op == IBD_ASYNC_JOIN) {
5820                 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
5821                         ibd_print_warn(state, "Join multicast group failed :"
5822                         "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
5823                 }
5824         } else {
5825                 /*
5826                  * Here, we must search for the proper mcg_info and
5827                  * use that to leave the group.
5828                  */
5829                 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
5830         }
5831 }
5832 
5833 /*
5834  * GLDv3 entry point for multicast enable/disable requests.
5835  * This function queues the operation to the async thread and
5836  * returns success for a valid multicast address.
5837  */
5838 static int
5839 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
5840 {
5841         ibd_state_t *state = (ibd_state_t *)arg;
5842         ipoib_mac_t maddr, *mcast;
5843         ib_gid_t mgid;
5844         ibd_req_t *req;
5845 
5846         if (state->id_type == IBD_PORT_DRIVER)
5847                 return (EINVAL);
5848 
5849         /*
5850          * If we haven't completed ibd_m_start(), the async thread wouldn't
5851          * have been started and id_bcaddr wouldn't be set, so there's
5852          * no point in continuing.
5853          */
5854         if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5855                 return (0);
5856 
5857         /*
5858          * The incoming multicast address might not be aligned properly
5859          * on a 4 byte boundary to be considered an ipoib_mac_t. We force
5860          * it to look like one though, to get the offsets of the mc gid,
5861          * since we know we are not going to dereference any values with
5862          * the ipoib_mac_t pointer.
5863          */
5864         bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
5865         mcast = &maddr;
5866 
5867         /*
5868          * Check validity of MCG address. We could additionally check
5869          * that an enable/disable is not being issued on the "broadcast"
5870          * mcg, but since this operation is only invokable by privileged
5871          * programs anyway, we allow the flexibility to those dlpi apps.
5872          * Note that we do not validate the "scope" of the IBA mcg.
5873          */
5874         if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
5875                 return (EINVAL);
5876 
5877         /*
5878          * fill in multicast pkey and scope
5879          */
5880         IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
5881 
5882         /*
5883          * If someone is trying to JOIN/LEAVE the broadcast group, we do
5884          * nothing (i.e. we stay JOINed to the broadcast group done in
5885          * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
5886          * requires us to be joined to broadcast groups at all times.
5887          * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
5888          * depends on this.
5889          */
5890         if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5891                 return (0);
5892 
5893         ibd_n2h_gid(mcast, &mgid);
5894         req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5895         if (req == NULL)
5896                 return (ENOMEM);
5897 
5898         req->rq_gid = mgid;
5899 
5900         if (add) {
5901                 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
5902                     mgid.gid_prefix, mgid.gid_guid);
5903                 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
5904         } else {
5905                 DPRINT(1, "ibd_m_multicst : unset_multicast : "
5906                     "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
5907                 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
5908         }
5909         return (0);
5910 }
5911 
5912 /*
5913  * The blocking part of the IBA promiscuous operations is done
5914  * out of here on the async thread. This routine may be invoked
5915  * as the result of either a dlpi request or a port up/down
5916  * event.
5917  */
5918 static void
5919 ibd_async_unsetprom(ibd_state_t *state)
5920 {
5921         ibd_mce_t *mce = list_head(&state->id_mc_non);
5922         ib_gid_t mgid;
5923 
5924         DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
5925 
5926         while (mce != NULL) {
5927                 mgid = mce->mc_info.mc_adds_vect.av_dgid;
5928                 mce = list_next(&state->id_mc_non, mce);
5929                 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
5930         }
5931         state->id_prom_op = IBD_OP_NOTSTARTED;
5932 }
5933 
5934 /*
5935  * The blocking part of the IBA promiscuous operations is done
5936  * out of here on the async thread. This routine may be invoked
5937  * as the result of either a dlpi request or a port up/down
5938  * event.
5939  */
5940 static void
5941 ibd_async_setprom(ibd_state_t *state)
5942 {
5943         ibt_mcg_attr_t mcg_attr;
5944         ibt_mcg_info_t *mcg_info;
5945         ib_gid_t mgid;
5946         uint_t numg;
5947         int i;
5948         char ret = IBD_OP_COMPLETED;
5949 
5950         DPRINT(2, "ibd_async_setprom : async_set_promisc");
5951 
5952         /*
5953          * Obtain all active MC groups on the IB fabric with
5954          * specified criteria (scope + Pkey + Qkey + mtu).
5955          */
5956         bzero(&mcg_attr, sizeof (mcg_attr));
5957         mcg_attr.mc_pkey = state->id_pkey;
5958         mcg_attr.mc_scope = state->id_scope;
5959         mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
5960         mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
5961         mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
5962         if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
5963             IBT_SUCCESS) {
5964                 ibd_print_warn(state, "Could not get list of IBA multicast "
5965                     "groups");
5966                 ret = IBD_OP_ERRORED;
5967                 goto done;
5968         }
5969 
5970         /*
5971          * Iterate over the returned mcg's and join as NonMember
5972          * to the IP mcg's.
5973          */
5974         for (i = 0; i < numg; i++) {
5975                 /*
5976                  * Do a NonMember JOIN on the MC group.
5977                  */
5978                 mgid = mcg_info[i].mc_adds_vect.av_dgid;
5979                 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
5980                         ibd_print_warn(state, "IBA promiscuous mode missed "
5981                             "multicast gid %016llx:%016llx",
5982                             (u_longlong_t)mgid.gid_prefix,
5983                             (u_longlong_t)mgid.gid_guid);
5984         }
5985 
5986         ibt_free_mcg_info(mcg_info, numg);
5987         DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
5988 done:
5989         state->id_prom_op = ret;
5990 }
5991 
5992 /*
5993  * GLDv3 entry point for multicast promiscuous enable/disable requests.
5994  * GLDv3 assumes phys state receives more packets than multi state,
5995  * which is not true for IPoIB. Thus, treat the multi and phys
5996  * promiscuous states the same way to work with GLDv3's assumption.
5997  */
5998 static int
5999 ibd_m_promisc(void *arg, boolean_t on)
6000 {
6001         ibd_state_t *state = (ibd_state_t *)arg;
6002         ibd_req_t *req;
6003 
6004         if (state->id_type == IBD_PORT_DRIVER)
6005                 return (EINVAL);
6006 
6007         /*
6008          * The async thread won't have been started if we haven't
6009          * completed ibd_m_start()
6010          */
6011         if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6012                 return (0);
6013 
6014         req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6015         if (req == NULL)
6016                 return (ENOMEM);
6017         if (on) {
6018                 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
6019                 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
6020         } else {
6021                 DPRINT(1, "ibd_m_promisc : unset_promisc");
6022                 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
6023         }
6024 
6025         return (0);
6026 }
6027 
6028 /*
6029  * GLDv3 entry point for gathering statistics.
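      * Byte and packet counts reported here are the sum of the UD path
      * counters and the corresponding RC (connected mode) counters.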
6030  */
6031 static int
6032 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
6033 {
6034         ibd_state_t *state = (ibd_state_t *)arg;
6035 
6036         switch (stat) {
6037         case MAC_STAT_IFSPEED:
6038                 *val = state->id_link_speed;
6039                 break;
6040         case MAC_STAT_MULTIRCV:
6041                 *val = state->id_multi_rcv;
6042                 break;
6043         case MAC_STAT_BRDCSTRCV:
6044                 *val = state->id_brd_rcv;
6045                 break;
6046         case MAC_STAT_MULTIXMT:
6047                 *val = state->id_multi_xmt;
6048                 break;
6049         case MAC_STAT_BRDCSTXMT:
6050                 *val = state->id_brd_xmt;
6051                 break;
6052         case MAC_STAT_RBYTES:
6053                 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte
6054                     + state->rc_rcv_copy_byte;
6055                 break;
6056         case MAC_STAT_IPACKETS:
6057                 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
6058                     + state->rc_rcv_copy_pkt;
6059                 break;
6060         case MAC_STAT_OBYTES:
6061                 *val = state->id_xmt_bytes + state->rc_xmt_bytes;
6062                 break;
6063         case MAC_STAT_OPACKETS:
6064                 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
6065                     state->rc_xmt_fragmented_pkt +
6066                     state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
6067                 break;
6068         case MAC_STAT_OERRORS:
6069                 *val = state->id_ah_error;   /* failed AH translation */
6070                 break;
6071         case MAC_STAT_IERRORS:
6072                 *val = 0;
6073                 break;
6074         case MAC_STAT_NOXMTBUF:
6075                 *val = state->id_tx_short + state->rc_swqe_short +
6076                     state->rc_xmt_buf_short;
6077                 break;
6078         case MAC_STAT_NORCVBUF:
6079         default:
6080                 return (ENOTSUP);
6081         }
6082 
6083         return (0);
6084 }
6085 
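     /*
      * Async thread handler for a Tx resched request; simply invokes
      * ibd_resume_transmission() from the async thread's context.
      */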
6086 static void
6087 ibd_async_txsched(ibd_state_t *state)
6088 {
6089         ibd_resume_transmission(state);
6090 }
6091 
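     /*
      * Check whether the resource that blocked transmission (free swqes
      * or LSO buffers) is available again above its threshold; if so,
      * clear the corresponding id_sched_needed bit and call
      * mac_tx_update() so GLDv3 retries the blocked transmits.
      */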
6092 static void
6093 ibd_resume_transmission(ibd_state_t *state)
6094 {
6095         int flag;
6096         int met_thresh = 0;
6097         int thresh = 0;
6098         int ret = -1;
6099 
6100         mutex_enter(&state->id_sched_lock);
6101         if (state->id_sched_needed & IBD_RSRC_SWQE) {
6102                 mutex_enter(&state->id_tx_list.dl_mutex);
6103                 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6104                 met_thresh = state->id_tx_list.dl_cnt +
6105                     state->id_tx_rel_list.dl_cnt;
6106                 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6107                 mutex_exit(&state->id_tx_list.dl_mutex);
6108                 thresh = IBD_FREE_SWQES_THRESH;
6109                 flag = IBD_RSRC_SWQE;
6110         } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6111                 ASSERT(state->id_lso != NULL);
6112                 mutex_enter(&state->id_lso_lock);
6113                 met_thresh = state->id_lso->bkt_nfree;
6114                 thresh = IBD_FREE_LSOS_THRESH;
6115                 mutex_exit(&state->id_lso_lock);
6116                 flag = IBD_RSRC_LSOBUF;
6117                 if (met_thresh > thresh)
6118                         state->id_sched_lso_cnt++;
6119         }
6120         if (met_thresh > thresh) {
6121                 state->id_sched_needed &= ~flag;
6122                 state->id_sched_cnt++;
6123                 ret = 0;
6124         }
6125         mutex_exit(&state->id_sched_lock);
6126 
6127         if (ret == 0)
6128                 mac_tx_update(state->id_mh);
6129 }
6130 
6131 /*
6132  * Release a chain of send wqes back onto the free list.
6133  */
6134 static void
6135 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6136 {
6137         /*
6138          * Add back on the Tx release list for reuse.
6139          */
6140         ASSERT(tail->swqe_next == NULL);
6141         mutex_enter(&state->id_tx_rel_list.dl_mutex);
6142         state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6143         tail->swqe_next = state->id_tx_rel_list.dl_head;
6144         state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6145         state->id_tx_rel_list.dl_cnt += n;
6146         mutex_exit(&state->id_tx_rel_list.dl_mutex);
6147 }
6148 
6149 /*
6150  * Acquire a send wqe from the free list.
6151  * Returns the send wqe pointer, or NULL if none is available.
6152  */
6153 static ibd_swqe_t *
6154 ibd_acquire_swqe(ibd_state_t *state)
6155 {
6156         ibd_swqe_t *wqe;
6157 
6158         mutex_enter(&state->id_tx_rel_list.dl_mutex);
6159         if (state->id_tx_rel_list.dl_head != NULL) {
6160                 /* transfer id_tx_rel_list to id_tx_list */
6161                 state->id_tx_list.dl_head =
6162                     state->id_tx_rel_list.dl_head;
6163                 state->id_tx_list.dl_cnt =
6164                     state->id_tx_rel_list.dl_cnt;
6165                 state->id_tx_list.dl_pending_sends = B_FALSE;
6166 
6167                 /* clear id_tx_rel_list */
6168                 state->id_tx_rel_list.dl_head = NULL;
6169                 state->id_tx_rel_list.dl_cnt = 0;
6170                 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6171 
6172                 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6173                 state->id_tx_list.dl_cnt -= 1;
6174                 state->id_tx_list.dl_head = wqe->swqe_next;
6175         } else {        /* no free swqe */
6176                 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6177                 state->id_tx_list.dl_pending_sends = B_TRUE;
6178                 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6179                 state->id_tx_short++;
6180                 wqe = NULL;
6181         }
6182         return (wqe);
6183 }
6184 
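     /*
      * Fill in the LSO portion of the send work request: the UD
      * destination, the MSS and the combined IPoIB+IP+TCP header size.
      * If the headers are contained in the first mblk fragment, lso_hdr
      * simply points at it; otherwise a private copy of the headers is
      * allocated here and later released by ibd_free_lsohdr().
      */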
6185 static int
6186 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6187     ibt_ud_dest_hdl_t ud_dest)
6188 {
6189         mblk_t  *nmp;
6190         int iph_len, tcph_len;
6191         ibt_wr_lso_t *lso;
6192         uintptr_t ip_start, tcp_start;
6193         uint8_t *dst;
6194         uint_t pending, mblen;
6195 
6196         /*
6197          * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6198          * we need to adjust it here for lso.
6199          */
6200         lso = &(node->w_swr.wr.ud_lso);
6201         lso->lso_ud_dest = ud_dest;
6202         lso->lso_mss = mss;
6203 
6204         /*
6205          * Calculate the LSO header size and set it in the UD LSO structure.
6206          * Note that the only assumption we make is that each of the IPoIB,
6207          * IP and TCP headers will be contained in a single mblk fragment;
6208          * together, the headers may span multiple mblk fragments.
6209          */
6210         nmp = mp;
6211         ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
6212         if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
6213                 ip_start = (uintptr_t)nmp->b_cont->b_rptr
6214                     + (ip_start - (uintptr_t)(nmp->b_wptr));
6215                 nmp = nmp->b_cont;
6216 
6217         }
6218         iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
6219 
6220         tcp_start = ip_start + iph_len;
6221         if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
6222                 tcp_start = (uintptr_t)nmp->b_cont->b_rptr
6223                     + (tcp_start - (uintptr_t)(nmp->b_wptr));
6224                 nmp = nmp->b_cont;
6225         }
6226         tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
6227         lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
6228 
6229         /*
6230          * If the lso header fits entirely within a single mblk fragment,
6231          * we'll avoid an additional copy of the lso header here and just
6232          * pass the b_rptr of the mblk directly.
6233          *
6234          * If this isn't true, we'd have to allocate for it explicitly.
6235          */
6236         if (lso->lso_hdr_sz <= MBLKL(mp)) {
6237                 lso->lso_hdr = mp->b_rptr;
6238         } else {
6239                 /* On work completion, remember to free this allocated hdr */
6240                 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
6241                 if (lso->lso_hdr == NULL) {
6242                         DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
6243                             "sz = %d", lso->lso_hdr_sz);
6244                         lso->lso_hdr_sz = 0;
6245                         lso->lso_mss = 0;
6246                         return (-1);
6247                 }
6248         }
6249 
6250         /*
6251          * Copy in the lso header only if we need to
6252          */
6253         if (lso->lso_hdr != mp->b_rptr) {
6254                 dst = lso->lso_hdr;
6255                 pending = lso->lso_hdr_sz;
6256 
6257                 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
6258                         mblen = MBLKL(nmp);
6259                         if (pending > mblen) {
6260                                 bcopy(nmp->b_rptr, dst, mblen);
6261                                 dst += mblen;
6262                                 pending -= mblen;
6263                         } else {
6264                                 bcopy(nmp->b_rptr, dst, pending);
6265                                 break;
6266                         }
6267                 }
6268         }
6269 
6270         return (0);
6271 }
6272 
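     /*
      * Release the private LSO header copy, if ibd_setup_lso() had to
      * allocate one for this swqe.
      */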
6273 static void
6274 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
6275 {
6276         ibt_wr_lso_t *lso;
6277 
6278         if ((!node) || (!mp))
6279                 return;
6280 
6281         /*
6282          * Free any header space that we might've allocated if we
6283          * did an LSO
6284          */
6285         if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
6286                 lso = &(node->w_swr.wr.ud_lso);
6287                 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
6288                         kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
6289                         lso->lso_hdr = NULL;
6290                         lso->lso_hdr_sz = 0;
6291                 }
6292         }
6293 }
6294 
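     /*
      * Post the given swqe on the UD channel, then keep draining the
      * id_tx_head chain that other senders may have queued while the Tx
      * path was busy, posting up to IBD_MAX_TX_POST_MULTIPLE work
      * requests per ibt_post_send() call. Any wrs that could not be
      * posted are cleaned up right away, since no completion will ever
      * arrive for them.
      */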
6295 static void
6296 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
6297 {
6298         uint_t          i;
6299         uint_t          num_posted;
6300         uint_t          n_wrs;
6301         ibt_status_t    ibt_status;
6302         ibt_send_wr_t   wrs[IBD_MAX_TX_POST_MULTIPLE];
6303         ibd_swqe_t      *tx_head, *elem;
6304         ibd_swqe_t      *nodes[IBD_MAX_TX_POST_MULTIPLE];
6305 
6306         /* post the one request, then check for more */
6307         ibt_status = ibt_post_send(state->id_chnl_hdl,
6308             &node->w_swr, 1, NULL);
6309         if (ibt_status != IBT_SUCCESS) {
6310                 ibd_print_warn(state, "ibd_post_send: "
6311                     "posting one wr failed: ret=%d", ibt_status);
6312                 ibd_tx_cleanup(state, node);
6313         }
6314 
6315         tx_head = NULL;
6316         for (;;) {
6317                 if (tx_head == NULL) {
6318                         mutex_enter(&state->id_txpost_lock);
6319                         tx_head = state->id_tx_head;
6320                         if (tx_head == NULL) {
6321                                 state->id_tx_busy = 0;
6322                                 mutex_exit(&state->id_txpost_lock);
6323                                 return;
6324                         }
6325                         state->id_tx_head = NULL;
6326                         mutex_exit(&state->id_txpost_lock);
6327                 }
6328 
6329                 /*
6330                  * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
6331                  * at a time if possible, and keep posting them.
6332                  */
6333                 for (n_wrs = 0, elem = tx_head;
6334                     (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
6335                     elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
6336                         nodes[n_wrs] = elem;
6337                         wrs[n_wrs] = elem->w_swr;
6338                 }
6339                 tx_head = elem;
6340 
6341                 ASSERT(n_wrs != 0);
6342 
6343                 /*
6344                  * If posting fails for some reason, we'll never receive
6345                  * a completion notification, so we'll need to clean up. But
6346                  * we need to make sure we don't clean up nodes whose
6347                  * wrs have been successfully posted. We assume that the
6348                  * hca driver returns on the first failure to post and
6349                  * therefore the first 'num_posted' entries don't need
6350                  * cleanup here.
6351                  */
6352                 num_posted = 0;
6353                 ibt_status = ibt_post_send(state->id_chnl_hdl,
6354                     wrs, n_wrs, &num_posted);
6355                 if (ibt_status != IBT_SUCCESS) {
6356                         ibd_print_warn(state, "ibd_post_send: "
6357                             "posting multiple wrs failed: "
6358                             "requested=%d, done=%d, ret=%d",
6359                             n_wrs, num_posted, ibt_status);
6360 
6361                         for (i = num_posted; i < n_wrs; i++)
6362                                 ibd_tx_cleanup(state, nodes[i]);
6363                 }
6364         }
6365 }
6366 
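     /*
      * Construct the scatter/gather list for a send. Large packets on
      * reserved-lkey capable HCAs have their mblk chain DMA-mapped
      * directly via ibt_map_mem_iov(); otherwise the packet is copied
      * into the swqe's pre-mapped copy buffer, or into pre-mapped LSO
      * buffers when it exceeds id_tx_buf_sz. For LSO sends the header
      * bytes, carried separately in the work request, are skipped when
      * building the data segments.
      */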
6367 static int
6368 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
6369     uint_t lsohdr_sz)
6370 {
6371         ibt_wr_ds_t *sgl;
6372         ibt_status_t ibt_status;
6373         mblk_t *nmp;
6374         mblk_t *data_mp;
6375         uchar_t *bufp;
6376         size_t blksize;
6377         size_t skip;
6378         size_t avail;
6379         uint_t pktsize;
6380         uint_t frag_len;
6381         uint_t pending_hdr;
6382         int nmblks;
6383         int i;
6384 
6385         /*
6386          * Let's skip ahead to the data if this is LSO
6387          */
6388         data_mp = mp;
6389         pending_hdr = 0;
6390         if (lsohdr_sz) {
6391                 pending_hdr = lsohdr_sz;
6392                 for (nmp = mp; nmp; nmp = nmp->b_cont) {
6393                         frag_len = nmp->b_wptr - nmp->b_rptr;
6394                         if (frag_len > pending_hdr)
6395                                 break;
6396                         pending_hdr -= frag_len;
6397                 }
6398                 data_mp = nmp;  /* start of data past lso header */
6399                 ASSERT(data_mp != NULL);
6400         }
6401 
6402         /*
6403          * Calculate the size of message data and number of msg blocks
6404          */
6405         pktsize = 0;
6406         for (nmblks = 0, nmp = data_mp; nmp != NULL;
6407             nmp = nmp->b_cont, nmblks++) {
6408                 pktsize += MBLKL(nmp);
6409         }
6410         pktsize -= pending_hdr;
6411 
6412         /*
6413          * We only do ibt_map_mem_iov() if the pktsize is above the
6414          * "copy-threshold", and if the number of mp fragments is less than
6415          * the maximum acceptable.
6416          */
6417         if ((state->id_hca_res_lkey_capab) &&
6418             (pktsize > state->id_ud_tx_copy_thresh) &&
6419             (nmblks < state->id_max_sqseg_hiwm)) {
6420                 ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6421                 ibt_iov_attr_t iov_attr;
6422 
6423                 iov_attr.iov_as = NULL;
6424                 iov_attr.iov = iov_arr;
6425                 iov_attr.iov_buf = NULL;
6426                 iov_attr.iov_list_len = nmblks;
6427                 iov_attr.iov_wr_nds = state->id_max_sqseg;
6428                 iov_attr.iov_lso_hdr_sz = lsohdr_sz;
6429                 iov_attr.iov_flags = IBT_IOV_SLEEP;
6430 
6431                 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
6432                         iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
6433                         iov_arr[i].iov_len = MBLKL(nmp);
6434                         if (i == 0) {
6435                                 iov_arr[i].iov_addr += pending_hdr;
6436                                 iov_arr[i].iov_len -= pending_hdr;
6437                         }
6438                 }
6439 
6440                 node->w_buftype = IBD_WQE_MAPPED;
6441                 node->w_swr.wr_sgl = node->w_sgl;
6442 
6443                 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
6444                     (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
6445                 if (ibt_status != IBT_SUCCESS) {
6446                         ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
6447                             "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
6448                         goto ibd_copy_path;
6449                 }
6450 
6451                 return (0);
6452         }
6453 
6454 ibd_copy_path:
6455         if (pktsize <= state->id_tx_buf_sz) {
6456                 node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6457                 node->w_swr.wr_nds = 1;
6458                 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6459                 node->w_buftype = IBD_WQE_TXBUF;
6460 
6461                 /*
6462                  * Even though this is the copy path for transfers less than
6463                  * id_tx_buf_sz, it could still be an LSO packet.  If so, it
6464                  * is possible the first data mblk fragment (data_mp) still
6465                  * contains part of the LSO header that we need to skip.
6466                  */
6467                 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6468                 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
6469                         blksize = MBLKL(nmp) - pending_hdr;
6470                         bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
6471                         bufp += blksize;
6472                         pending_hdr = 0;
6473                 }
6474 
6475                 return (0);
6476         }
6477 
6478         /*
6479          * Copy path for transfers greater than id_tx_buf_sz
6480          */
6481         node->w_swr.wr_sgl = node->w_sgl;
6482         if (ibd_acquire_lsobufs(state, pktsize,
6483             node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
6484                 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
6485                 return (-1);
6486         }
6487         node->w_buftype = IBD_WQE_LSOBUF;
6488 
6489         /*
6490          * Copy the larger-than-id_tx_buf_sz packet into a set of
6491          * fixed-sized, pre-mapped LSO buffers. Note that we might
6492          * need to skip part of the LSO header in the first fragment
6493          * as before.
6494          */
6495         nmp = data_mp;
6496         skip = pending_hdr;
6497         for (i = 0; i < node->w_swr.wr_nds; i++) {
6498                 sgl = node->w_swr.wr_sgl + i;
6499                 bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
6500                 avail = IBD_LSO_BUFSZ;
6501                 while (nmp && avail) {
6502                         blksize = MBLKL(nmp) - skip;
6503                         if (blksize > avail) {
6504                                 bcopy(nmp->b_rptr + skip, bufp, avail);
6505                                 skip += avail;
6506                                 avail = 0;
6507                         } else {
6508                                 bcopy(nmp->b_rptr + skip, bufp, blksize);
6509                                 skip = 0;
6510                                 avail -= blksize;
6511                                 bufp += blksize;
6512                                 nmp = nmp->b_cont;
6513                         }
6514                 }
6515         }
6516 
6517         return (0);
6518 }
6519 
6520 /*
6521  * Schedule a completion queue poll to reap the resource we're
6522  * short on.  If we implement the change to reap tx completions
6523  * in a separate thread, we'll need to wake up that thread here.
6524  */
6525 static int
6526 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
6527 {
6528         ibd_req_t *req;
6529 
6530         mutex_enter(&state->id_sched_lock);
6531         state->id_sched_needed |= resource_type;
6532         mutex_exit(&state->id_sched_lock);
6533 
6534         /*
6535          * If we are asked to queue a work entry, we need to do it
6536          */
6537         if (q_flag) {
6538                 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6539                 if (req == NULL)
6540                         return (-1);
6541 
6542                 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
6543         }
6544 
6545         return (0);
6546 }
6547 
6548 /*
6549  * The passed-in packet has this format:
6550  * IPOIB_ADDRL-byte dest addr :: 2-byte sap :: 2 zero bytes :: data
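      *
      * Returns B_TRUE if the mblk has been consumed (transmitted or
      * dropped), and B_FALSE if the caller should hold on to the packet
      * and retry the send later (e.g. while swqes are exhausted).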
6551  */
6552 static boolean_t
6553 ibd_send(ibd_state_t *state, mblk_t *mp)
6554 {
6555         ibd_ace_t *ace;
6556         ibd_swqe_t *node;
6557         ipoib_mac_t *dest;
6558         ib_header_info_t *ipibp;
6559         ip6_t *ip6h;
6560         uint_t pktsize;
6561         uint32_t mss;
6562         uint32_t hckflags;
6563         uint32_t lsoflags = 0;
6564         uint_t lsohdr_sz = 0;
6565         int ret, len;
6566         boolean_t dofree = B_FALSE;
6567         boolean_t rc;
6568         /* if (rc_chan == NULL) send by UD; else send by RC; */
6569         ibd_rc_chan_t *rc_chan;
6570         int nmblks;
6571         mblk_t *nmp;
6572 
6573         /*
6574          * If we aren't done with the device initialization and start,
6575          * we shouldn't be here.
6576          */
6577         if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6578                 return (B_FALSE);
6579 
6580         /*
6581          * Obtain an address handle for the destination.
6582          */
6583         ipibp = (ib_header_info_t *)mp->b_rptr;
6584         dest = (ipoib_mac_t *)&ipibp->ib_dst;
6585         if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6586                 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
6587 
6588         rc_chan = NULL;
6589         ace = ibd_acache_lookup(state, dest, &ret, 1);
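             /*
              * In the RC mode, if an RC connection to this (unicast)
              * destination is already established, take a swqe from that
              * channel's Tx list and send over the RC channel; otherwise
              * fall through to the UD path below.
              */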
6590         if (state->id_enable_rc && (ace != NULL) &&
6591             (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
6592                 if (ace->ac_chan == NULL) {
6593                         state->rc_null_conn++;
6594                 } else {
6595                         if (ace->ac_chan->chan_state ==
6596                             IBD_RC_STATE_ACT_ESTAB) {
6597                                 rc_chan = ace->ac_chan;
6598                                 rc_chan->is_used = B_TRUE;
6599                                 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
6600                                 node = WQE_TO_SWQE(
6601                                     rc_chan->tx_wqe_list.dl_head);
6602                                 if (node != NULL) {
6603                                         rc_chan->tx_wqe_list.dl_cnt -= 1;
6604                                         rc_chan->tx_wqe_list.dl_head =
6605                                             node->swqe_next;
6606                                 } else {
6607                                         node = ibd_rc_acquire_swqes(rc_chan);
6608                                 }
6609                                 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
6610 
6611                                 if (node == NULL) {
6612                                         state->rc_swqe_short++;
6613                                         mutex_enter(&state->id_sched_lock);
6614                                         state->id_sched_needed |=
6615                                             IBD_RSRC_RC_SWQE;
6616                                         mutex_exit(&state->id_sched_lock);
6617                                         ibd_dec_ref_ace(state, ace);
6618                                         return (B_FALSE);
6619                                 }
6620                         } else {
6621                                 state->rc_no_estab_conn++;
6622                         }
6623                 }
6624         }
6625 
6626         if (rc_chan == NULL) {
6627                 mutex_enter(&state->id_tx_list.dl_mutex);
6628                 node = WQE_TO_SWQE(state->id_tx_list.dl_head);
6629                 if (node != NULL) {
6630                         state->id_tx_list.dl_cnt -= 1;
6631                         state->id_tx_list.dl_head = node->swqe_next;
6632                 } else {
6633                         node = ibd_acquire_swqe(state);
6634                 }
6635                 mutex_exit(&state->id_tx_list.dl_mutex);
6636                 if (node == NULL) {
6637                         /*
6638                          * If we don't have an swqe available, schedule a
6639                          * transmit completion queue cleanup and hold off on
6640                          * sending more packets until we have some free swqes
6641                          */
6642                         if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
6643                                 if (ace != NULL) {
6644                                         ibd_dec_ref_ace(state, ace);
6645                                 }
6646                                 return (B_FALSE);
6647                         }
6648 
6649                         /*
6650                          * If a poll cannot be scheduled, we have no choice but
6651                          * to drop this packet
6652                          */
6653                         ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
6654                         if (ace != NULL) {
6655                                 ibd_dec_ref_ace(state, ace);
6656                         }
6657                         return (B_TRUE);
6658                 }
6659         }
6660 
6661         /*
6662          * Initialize the commonly used fields in swqe to NULL to protect
6663          * against ibd_tx_cleanup accidentally misinterpreting these on a
6664          * failure.
6665          */
6666         node->swqe_im_mblk = NULL;
6667         node->w_swr.wr_nds = 0;
6668         node->w_swr.wr_sgl = NULL;
6669         node->w_swr.wr_opcode = IBT_WRC_SEND;
6670 
6671         /*
6672          * Calculate the size of message data and number of msg blocks
6673          */
6674         pktsize = 0;
6675         for (nmblks = 0, nmp = mp; nmp != NULL;
6676             nmp = nmp->b_cont, nmblks++) {
6677                 pktsize += MBLKL(nmp);
6678         }
6679 
6680         if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6681                 atomic_inc_64(&state->id_brd_xmt);
6682         else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6683                 atomic_inc_64(&state->id_multi_xmt);
6684 
6685         if (ace != NULL) {
6686                 node->w_ahandle = ace;
6687                 node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
6688         } else {
6689                 DPRINT(5,
6690                     "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
6691                     ((ret == EFAULT) ? "failed" : "queued"),
6692                     htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
6693                     htonl(dest->ipoib_gidpref[1]),
6694                     htonl(dest->ipoib_gidsuff[0]),
6695                     htonl(dest->ipoib_gidsuff[1]));
6696                 state->rc_ace_not_found++;
6697                 node->w_ahandle = NULL;
6698 
6699                 /*
6700                  * If ibd_acache_lookup() returns EFAULT, ibd cannot find
6701                  * a path to the specified destination address, so drop
6702                  * the packet.  We also drop the packet if we cannot
6703                  * schedule a poll via the async thread.  In the normal
6704                  * case, ibd returns the packet to the upper layer and
6705                  * waits for the AH to be created.
6706                  *
6707                  * Note that we always queue a work slot entry for the
6708                  * async thread when the AH lookup fails (even in intr
6709                  * mode), due to the convoluted way the code looks for AH.
6710                  */
6711                 if (ret == EFAULT) {
6712                         dofree = B_TRUE;
6713                         rc = B_TRUE;
6714                 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
6715                         dofree = B_TRUE;
6716                         rc = B_TRUE;
6717                 } else {
6718                         dofree = B_FALSE;
6719                         rc = B_FALSE;
6720                 }
6721                 goto ibd_send_fail;
6722         }
6723 
6724         /*
6725          * For ND6 packets, the padding is at the front of the source
6726          * lladdr.  Insert the padding at the front.
6727          */
6728         if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
6729                 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
6730                         if (!pullupmsg(mp, IPV6_HDR_LEN +
6731                             sizeof (ib_header_info_t))) {
6732                                 DPRINT(10, "ibd_send: pullupmsg failure ");
6733                                 dofree = B_TRUE;
6734                                 rc = B_TRUE;
6735                                 goto ibd_send_fail;
6736                         }
6737                         ipibp = (ib_header_info_t *)mp->b_rptr;
6738                 }
6739                 ip6h = (ip6_t *)((uchar_t *)ipibp +
6740                     sizeof (ib_header_info_t));
6741                 len = ntohs(ip6h->ip6_plen);
6742                 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6743                         mblk_t  *pad;
6744 
6745                         /* allocb() may fail; drop the packet if it does */
                             if ((pad = allocb(4, 0)) == NULL) {
                                     dofree = B_TRUE;
                                     rc = B_TRUE;
                                     goto ibd_send_fail;
                             }
6746                         pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
6747                         linkb(mp, pad);
6748                         if (MBLKL(mp) < sizeof (ib_header_info_t) +
6749                             IPV6_HDR_LEN + len + 4) {
6750                                 if (!pullupmsg(mp, sizeof (ib_header_info_t) +
6751                                     IPV6_HDR_LEN + len + 4)) {
6752                                         DPRINT(10, "ibd_send: pullupmsg "
6753                                             "failure ");
6754                                         dofree = B_TRUE;
6755                                         rc = B_TRUE;
6756                                         goto ibd_send_fail;
6757                                 }
6758                                 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
6759                                     sizeof (ib_header_info_t));
6760                         }
6761 
6762                         /* LINTED: E_CONSTANT_CONDITION */
6763                         IBD_PAD_NSNA(ip6h, len, IBD_SEND);
6764                 }
6765         }
6766 
6767         ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
6768         mp->b_rptr += sizeof (ib_addrs_t);
6769         pktsize -= sizeof (ib_addrs_t);
6770 
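             /*
              * RC transmit.  Small packets are bcopy'ed into the swqe copy
              * buffer; larger ones are either mapped with ibt_map_mem_iov()
              * or copied into a pre-registered large buffer, depending on
              * the fragment count and tunables.
              */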
6771         if (rc_chan) {  /* send in RC mode */
6772                 ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6773                 ibt_iov_attr_t iov_attr;
6774                 uint_t          i;
6775                 size_t  blksize;
6776                 uchar_t *bufp;
6777                 ibd_rc_tx_largebuf_t *lbufp;
6778 
6779                 atomic_add_64(&state->rc_xmt_bytes, pktsize);
6780 
6781                 /*
6782                  * The upper layer does the Tx checksum; we don't need to
6783                  * do any checksumming here.
6784                  */
6785                 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
6786 
6787                 /*
6788                  * We only do ibt_map_mem_iov() if the pktsize is above
6789                  * the "copy-threshold", and if the number of mp
6790                  * fragments is less than the maximum acceptable.
6791                  */
6792                 if (pktsize <= state->id_rc_tx_copy_thresh) {
6793                         atomic_inc_64(&state->rc_xmt_small_pkt);
6794                         /*
6795                          * Only unicast packets are processed in Reliable
6796                          * Connected mode.
6797                          */
6798                         node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6799                         node->w_swr.wr_nds = 1;
6800                         node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6801                         node->w_buftype = IBD_WQE_TXBUF;
6802 
6803                         bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6804                         for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6805                                 blksize = MBLKL(nmp);
6806                                 bcopy(nmp->b_rptr, bufp, blksize);
6807                                 bufp += blksize;
6808                         }
6809                         freemsg(mp);
6810                         ASSERT(node->swqe_im_mblk == NULL);
6811                 } else {
6812                         if ((state->rc_enable_iov_map) &&
6813                             (nmblks < state->rc_max_sqseg_hiwm)) {
6814 
6815                                 /* do ibt_map_mem_iov() */
6816                                 iov_attr.iov_as = NULL;
6817                                 iov_attr.iov = iov_arr;
6818                                 iov_attr.iov_buf = NULL;
6819                                 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
6820                                 iov_attr.iov_lso_hdr_sz = 0;
6821                                 iov_attr.iov_flags = IBT_IOV_SLEEP;
6822 
6823                                 i = 0;
6824                                 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6825                                         iov_arr[i].iov_len = MBLKL(nmp);
6826                                         if (iov_arr[i].iov_len != 0) {
6827                                                 iov_arr[i].iov_addr = (caddr_t)
6828                                                     (void *)nmp->b_rptr;
6829                                                 i++;
6830                                         }
6831                                 }
6832                                 iov_attr.iov_list_len = i;
6833                                 node->w_swr.wr_sgl = node->w_sgl;
6834 
6835                                 ret = ibt_map_mem_iov(state->id_hca_hdl,
6836                                     &iov_attr, (ibt_all_wr_t *)&node->w_swr,
6837                                     &node->w_mi_hdl);
6838                                 if (ret != IBT_SUCCESS) {
6839                                         atomic_inc_64(
6840                                             &state->rc_xmt_map_fail_pkt);
6841                                         DPRINT(30, "ibd_send: ibt_map_mem_iov("
6842                                             ") failed, nmblks=%d, real_nmblks"
6843                                             "=%d, ret=0x%x", nmblks, i, ret);
6844                                         goto ibd_rc_large_copy;
6845                                 }
6846 
6847                                 atomic_inc_64(&state->rc_xmt_map_succ_pkt);
6848                                 node->w_buftype = IBD_WQE_MAPPED;
6849                                 node->swqe_im_mblk = mp;
6850                         } else {
6851                                 atomic_inc_64(&state->rc_xmt_fragmented_pkt);
6852 ibd_rc_large_copy:
6853                                 mutex_enter(&state->rc_tx_large_bufs_lock);
6854                                 if (state->rc_tx_largebuf_nfree == 0) {
6855                                         state->rc_xmt_buf_short++;
6856                                         mutex_exit
6857                                             (&state->rc_tx_large_bufs_lock);
6858                                         mutex_enter(&state->id_sched_lock);
6859                                         state->id_sched_needed |=
6860                                             IBD_RSRC_RC_TX_LARGEBUF;
6861                                         mutex_exit(&state->id_sched_lock);
6862                                         dofree = B_FALSE;
6863                                         rc = B_FALSE;
6864                                         /*
6865                                          * If we don't have any Tx large
6866                                          * bufs, return failure.
6867                                          * node->w_buftype must not be
6868                                          * IBD_WQE_RC_COPYBUF here, or it
6869                                          * will confuse ibd_rc_tx_cleanup().
6870                                          */
6871                                         node->w_buftype = IBD_WQE_TXBUF;
6872                                         goto ibd_send_fail;
6873                                 }
6874 
6875                                 lbufp = state->rc_tx_largebuf_free_head;
6876                                 ASSERT(lbufp->lb_buf != NULL);
6877                                 state->rc_tx_largebuf_free_head =
6878                                     lbufp->lb_next;
6879                                 lbufp->lb_next = NULL;
6880                                 /* Update nfree count */
6881                                 state->rc_tx_largebuf_nfree--;
6882                                 mutex_exit(&state->rc_tx_large_bufs_lock);
6883                                 bufp = lbufp->lb_buf;
6884                                 node->w_sgl[0].ds_va =
6885                                     (ib_vaddr_t)(uintptr_t)bufp;
6886                                 node->w_sgl[0].ds_key =
6887                                     state->rc_tx_mr_desc.md_lkey;
6888                                 node->w_sgl[0].ds_len = pktsize;
6889                                 node->w_swr.wr_sgl = node->w_sgl;
6890                                 node->w_swr.wr_nds = 1;
6891                                 node->w_buftype = IBD_WQE_RC_COPYBUF;
6892                                 node->w_rc_tx_largebuf = lbufp;
6893 
6894                                 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6895                                         blksize = MBLKL(nmp);
6896                                         if (blksize != 0) {
6897                                                 bcopy(nmp->b_rptr, bufp,
6898                                                     blksize);
6899                                                 bufp += blksize;
6900                                         }
6901                                 }
6902                                 freemsg(mp);
6903                                 ASSERT(node->swqe_im_mblk == NULL);
6904                         }
6905                 }
6906 
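                     /*
                      * Post the WQE to the RC channel.  Only one thread posts
                      * at a time; if another thread is already posting
                      * (tx_busy is set), chain this WQE onto tx_head/tx_tail
                      * and let that thread drain it.
                      */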
6907                 node->swqe_next = NULL;
6908                 mutex_enter(&rc_chan->tx_post_lock);
6909                 if (rc_chan->tx_busy) {
6910                         if (rc_chan->tx_head) {
6911                                 rc_chan->tx_tail->swqe_next =
6912                                     SWQE_TO_WQE(node);
6913                         } else {
6914                                 rc_chan->tx_head = node;
6915                         }
6916                         rc_chan->tx_tail = node;
6917                         mutex_exit(&rc_chan->tx_post_lock);
6918                 } else {
6919                         rc_chan->tx_busy = 1;
6920                         mutex_exit(&rc_chan->tx_post_lock);
6921                         ibd_rc_post_send(rc_chan, node);
6922                 }
6923 
6924                 return (B_TRUE);
6925         } /* send by RC */
6926 
6927         if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
6928                 /*
6929                  * The packet is too long.  The packet size from GLD should
6930                  * be <= state->id_mtu + sizeof (ib_addrs_t).
6931                  */
6932                 if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
6933                         ibd_req_t *req;
6934 
6935                         mutex_enter(&ace->tx_too_big_mutex);
6936                         if (ace->tx_too_big_ongoing) {
6937                                 mutex_exit(&ace->tx_too_big_mutex);
6938                                 state->rc_xmt_reenter_too_long_pkt++;
6939                                 dofree = B_TRUE;
6940                         } else {
6941                                 ace->tx_too_big_ongoing = B_TRUE;
6942                                 mutex_exit(&ace->tx_too_big_mutex);
6943                                 state->rc_xmt_icmp_too_long_pkt++;
6944 
6945                                 req = kmem_cache_alloc(state->id_req_kmc,
6946                                     KM_NOSLEEP);
6947                                 if (req == NULL) {
6948                                         ibd_print_warn(state, "ibd_send: alloc "
6949                                             "ibd_req_t fail");
6950                                         /* Drop it. */
6951                                         dofree = B_TRUE;
6952                                 } else {
6953                                         req->rq_ptr = mp;
6954                                         req->rq_ptr2 = ace;
6955                                         ibd_queue_work_slot(state, req,
6956                                             IBD_ASYNC_RC_TOO_BIG);
6957                                         dofree = B_FALSE;
6958                                 }
6959                         }
6960                 } else {
6961                         ibd_print_warn(state, "Reliable Connected mode is on. "
6962                             "Multicast packet length (%d > %d) is too long "
6963                             "to send, drop it",
6964                             pktsize, state->id_mtu);
6965                         state->rc_xmt_drop_too_long_pkt++;
6966                         /* Drop it. */
6967                         dofree = B_TRUE;
6968                 }
6969                 rc = B_TRUE;
6970                 goto ibd_send_fail;
6971         }
6972 
6973         atomic_add_64(&state->id_xmt_bytes, pktsize);
6974         atomic_inc_64(&state->id_xmt_pkt);
6975 
6976         /*
6977          * Do LSO and checksum related work here.  For an LSO send, set the
6978          * UD destination, the opcode and the LSO header information in the
6979          * work request.
6980          */
6981         mac_lso_get(mp, &mss, &lsoflags);
6982         if ((lsoflags & HW_LSO) != HW_LSO) {
6983                 node->w_swr.wr_opcode = IBT_WRC_SEND;
6984                 lsohdr_sz = 0;
6985         } else {
6986                 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
6987                         /*
6988                          * The routine can only fail if there's no memory; we
6989                          * can only drop the packet if this happens
6990                          */
6991                         ibd_print_warn(state,
6992                             "ibd_send: no memory, lso posting failed");
6993                         dofree = B_TRUE;
6994                         rc = B_TRUE;
6995                         goto ibd_send_fail;
6996                 }
6997 
6998                 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
6999                 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
7000         }
7001 
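             /*
              * If the stack requested full hardware checksum offload for
              * this packet, ask the HCA to insert the checksum on the send.
              */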
7002         mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
7003         if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
7004                 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
7005         else
7006                 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
7007 
7008         /*
7009          * Prepare the sgl for posting; the routine can only fail if there's
7010          * no lso buf available for posting. If this is the case, we should
7011          * probably resched for lso bufs to become available and then try again.
7012          */
7013         if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
7014                 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
7015                         dofree = B_TRUE;
7016                         rc = B_TRUE;
7017                 } else {
7018                         dofree = B_FALSE;
7019                         rc = B_FALSE;
7020                 }
7021                 goto ibd_send_fail;
7022         }
7023         node->swqe_im_mblk = mp;
7024 
7025         /*
7026          * Queue the wqe to hardware; since we can now simply queue a
7027          * post instead of doing it serially, we cannot assume anything
7028          * about the 'node' after ibd_post_send() returns.
7029          */
7030         node->swqe_next = NULL;
7031 
7032         mutex_enter(&state->id_txpost_lock);
7033         if (state->id_tx_busy) {
7034                 if (state->id_tx_head) {
7035                         state->id_tx_tail->swqe_next =
7036                             SWQE_TO_WQE(node);
7037                 } else {
7038                         state->id_tx_head = node;
7039                 }
7040                 state->id_tx_tail = node;
7041                 mutex_exit(&state->id_txpost_lock);
7042         } else {
7043                 state->id_tx_busy = 1;
7044                 mutex_exit(&state->id_txpost_lock);
7045                 ibd_post_send(state, node);
7046         }
7047 
7048         return (B_TRUE);
7049 
7050 ibd_send_fail:
7051         if (node && mp)
7052                 ibd_free_lsohdr(node, mp);
7053 
7054         if (dofree)
7055                 freemsg(mp);
7056 
7057         if (node != NULL) {
7058                 if (rc_chan) {
7059                         ibd_rc_tx_cleanup(node);
7060                 } else {
7061                         ibd_tx_cleanup(state, node);
7062                 }
7063         }
7064 
7065         return (rc);
7066 }
7067 
7068 /*
7069  * GLDv3 entry point for transmitting datagram.
7070  */
7071 static mblk_t *
7072 ibd_m_tx(void *arg, mblk_t *mp)
7073 {
7074         ibd_state_t *state = (ibd_state_t *)arg;
7075         mblk_t *next;
7076 
7077         if (state->id_type == IBD_PORT_DRIVER) {
7078                 freemsgchain(mp);
7079                 return (NULL);
7080         }
7081 
7082         if ((state->id_link_state != LINK_STATE_UP) ||
7083             !(state->id_mac_state & IBD_DRV_STARTED)) {
7084                 freemsgchain(mp);
7085                 mp = NULL;
7086         }
7087 
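             /*
              * Returning a non-NULL chain tells GLDv3 that we are out of Tx
              * resources; GLDv3 will retry the remaining packets after the
              * driver calls mac_tx_update().
              */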
7088         while (mp != NULL) {
7089                 next = mp->b_next;
7090                 mp->b_next = NULL;
7091                 if (ibd_send(state, mp) == B_FALSE) {
7092                         /* Send fail */
7093                         mp->b_next = next;
7094                         break;
7095                 }
7096                 mp = next;
7097         }
7098 
7099         return (mp);
7100 }
7101 
7102 /*
7103  * This handles Tx and Rx completions.  With separate CQs, this handles
7104  * only Rx completions.
7105  */
7106 static uint_t
7107 ibd_intr(caddr_t arg)
7108 {
7109         ibd_state_t *state = (ibd_state_t *)arg;
7110 
7111         ibd_poll_rcq(state, state->id_rcq_hdl);
7112 
7113         return (DDI_INTR_CLAIMED);
7114 }
7115 
7116 /*
7117  * Poll and fully drain the send cq
7118  */
7119 static void
7120 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7121 {
7122         ibt_wc_t *wcs = state->id_txwcs;
7123         uint_t numwcs = state->id_txwcs_size;
7124         ibd_wqe_t *wqe;
7125         ibd_swqe_t *head, *tail;
7126         ibt_wc_t *wc;
7127         uint_t num_polled;
7128         int i;
7129 
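             /*
              * Pull completions off the CQ in batches of up to
              * id_txwcs_size, chain the corresponding send WQEs together,
              * and hand the whole chain to ibd_tx_cleanup_list() so the
              * cleanup cost is amortized over the batch.
              */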
7130         while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7131                 head = tail = NULL;
7132                 for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7133                         wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
7134                         if (wc->wc_status != IBT_WC_SUCCESS) {
7135                                 /*
7136                                  * Channel being torn down.
7137                                  */
7138                                 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7139                                         DPRINT(5, "ibd_drain_scq: flush error");
7140                                         DPRINT(10, "ibd_drain_scq: Bad "
7141                                             "status %d", wc->wc_status);
7142                                 } else {
7143                                         DPRINT(10, "ibd_drain_scq: "
7144                                             "unexpected wc_status %d",
7145                                             wc->wc_status);
7146                                 }
7147                                 /*
7148                                  * Fallthrough to invoke the Tx handler to
7149                                  * release held resources, e.g., AH refcount.
7150                                  */
7151                         }
7152                         /*
7153                          * Add this swqe to the list to be cleaned up.
7154                          */
7155                         if (head)
7156                                 tail->swqe_next = wqe;
7157                         else
7158                                 head = WQE_TO_SWQE(wqe);
7159                         tail = WQE_TO_SWQE(wqe);
7160                 }
7161                 tail->swqe_next = NULL;
7162                 ibd_tx_cleanup_list(state, head, tail);
7163 
7164                 /*
7165                  * Resume any blocked transmissions if possible
7166                  */
7167                 ibd_resume_transmission(state);
7168         }
7169 }
7170 
7171 /*
7172  * Poll and fully drain the receive cq
7173  */
7174 static void
7175 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7176 {
7177         ibt_wc_t *wcs = state->id_rxwcs;
7178         uint_t numwcs = state->id_rxwcs_size;
7179         ibd_rwqe_t *rwqe;
7180         ibt_wc_t *wc;
7181         uint_t num_polled;
7182         int i;
7183         mblk_t *head, *tail, *mp;
7184 
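             /*
              * Each successful completion is turned into an mblk by
              * ibd_process_rx(); the mblks are chained via b_next and
              * passed up to GLDv3 with a single mac_rx() call per batch.
              */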
7185         while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7186                 head = tail = NULL;
7187                 for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7188                         rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
7189                         if (wc->wc_status != IBT_WC_SUCCESS) {
7190                                 /*
7191                                  * Channel being torn down.
7192                                  */
7193                                 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7194                                         DPRINT(5, "ibd_drain_rcq: "
7195                                             "expected flushed rwqe");
7196                                 } else {
7197                                         DPRINT(5, "ibd_drain_rcq: "
7198                                             "unexpected wc_status %d",
7199                                             wc->wc_status);
7200                                 }
7201                                 atomic_inc_32(
7202                                     &state->id_rx_list.dl_bufs_outstanding);
7203                                 freemsg(rwqe->rwqe_im_mblk);
7204                                 continue;
7205                         }
7206                         mp = ibd_process_rx(state, rwqe, wc);
7207                         if (mp == NULL)
7208                                 continue;
7209 
7210                         /*
7211                          * Add this mp to the list to send to the nw layer.
7212                          */
7213                         if (head)
7214                                 tail->b_next = mp;
7215                         else
7216                                 head = mp;
7217                         tail = mp;
7218                 }
7219                 if (head)
7220                         mac_rx(state->id_mh, state->id_rh, head);
7221 
7222                 /*
7223                  * Account for #rwqes polled.
7224                  * Post more here, if less than one fourth full.
7225                  */
7226                 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
7227                     (state->id_ud_num_rwqe / 4))
7228                         ibd_post_recv_intr(state);
7229         }
7230 }
7231 
7232 /*
7233  * Common code for interrupt handling as well as for polling
7234  * for all completed wqe's while detaching.
7235  */
7236 static void
7237 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7238 {
7239         int flag, redo_flag;
7240         int redo = 1;
7241 
7242         flag = IBD_CQ_POLLING;
7243         redo_flag = IBD_REDO_CQ_POLLING;
7244 
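             /*
              * Only one thread drains the send CQ at a time.  If another
              * thread is already polling, set the REDO flag so that thread
              * makes one more pass before it clears its busy state.
              */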
7245         mutex_enter(&state->id_scq_poll_lock);
7246         if (state->id_scq_poll_busy & flag) {
7247                 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
7248                 state->id_scq_poll_busy |= redo_flag;
7249                 mutex_exit(&state->id_scq_poll_lock);
7250                 return;
7251         }
7252         state->id_scq_poll_busy |= flag;
7253         mutex_exit(&state->id_scq_poll_lock);
7254 
7255         /*
7256          * In some cases (e.g., detaching), this code can be invoked on
7257          * any cpu after disabling cq notification (thus no concurrency
7258          * exists). Apart from that, the following applies normally:
7259          * Transmit completion handling could be from any cpu if
7260          * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
7261          * is interrupt driven.
7262          */
7263 
7264         /*
7265          * Poll and drain the CQ
7266          */
7267         ibd_drain_scq(state, cq_hdl);
7268 
7269         /*
7270          * Enable CQ notifications and redrain the cq to catch any
7271          * completions we might have missed after the ibd_drain_scq()
7272          * above and before the ibt_enable_cq_notify() that follows.
7273          * Finally, service any new requests to poll the cq that
7274          * could've come in after the ibt_enable_cq_notify().
7275          */
7276         do {
7277                 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
7278                     IBT_SUCCESS) {
7279                         DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7280                 }
7281 
7282                 ibd_drain_scq(state, cq_hdl);
7283 
7284                 mutex_enter(&state->id_scq_poll_lock);
7285                 if (state->id_scq_poll_busy & redo_flag)
7286                         state->id_scq_poll_busy &= ~redo_flag;
7287                 else {
7288                         state->id_scq_poll_busy &= ~flag;
7289                         redo = 0;
7290                 }
7291                 mutex_exit(&state->id_scq_poll_lock);
7292 
7293         } while (redo);
7294 }
7295 
7296 /*
7297  * Common code for interrupt handling as well as for polling
7298  * for all completed wqe's while detaching.
7299  */
7300 static void
7301 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
7302 {
7303         int flag, redo_flag;
7304         int redo = 1;
7305 
7306         flag = IBD_CQ_POLLING;
7307         redo_flag = IBD_REDO_CQ_POLLING;
7308 
7309         mutex_enter(&state->id_rcq_poll_lock);
7310         if (state->id_rcq_poll_busy & flag) {
7311                 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
7312                 state->id_rcq_poll_busy |= redo_flag;
7313                 mutex_exit(&state->id_rcq_poll_lock);
7314                 return;
7315         }
7316         state->id_rcq_poll_busy |= flag;
7317         mutex_exit(&state->id_rcq_poll_lock);
7318 
7319         /*
7320          * Poll and drain the CQ
7321          */
7322         ibd_drain_rcq(state, rcq);
7323 
7324         /*
7325          * Enable CQ notifications and redrain the cq to catch any
7326          * completions we might have missed after the ibd_drain_rcq()
7327          * above and before the ibt_enable_cq_notify() that follows.
7328          * Finally, service any new requests to poll the cq that
7329          * could've come in after the ibt_enable_cq_notify().
7330          */
7331         do {
7332                 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
7333                     IBT_SUCCESS) {
7334                         DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7335                 }
7336 
7337                 ibd_drain_rcq(state, rcq);
7338 
7339                 mutex_enter(&state->id_rcq_poll_lock);
7340                 if (state->id_rcq_poll_busy & redo_flag)
7341                         state->id_rcq_poll_busy &= ~redo_flag;
7342                 else {
7343                         state->id_rcq_poll_busy &= ~flag;
7344                         redo = 0;
7345                 }
7346                 mutex_exit(&state->id_rcq_poll_lock);
7347 
7348         } while (redo);
7349 }
7350 
7351 /*
7352  * Unmap the memory area associated with a given swqe.
7353  */
7354 void
7355 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
7356 {
7357         ibt_status_t stat;
7358 
7359         DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
7360 
7361         if (swqe->w_mi_hdl) {
7362                 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
7363                     swqe->w_mi_hdl)) != IBT_SUCCESS) {
7364                         DPRINT(10,
7365                             "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
7366                 }
7367                 swqe->w_mi_hdl = NULL;
7368         }
7369         swqe->w_swr.wr_nds = 0;
7370 }
7371 
7372 void
7373 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
7374 {
7375         /*
7376          * The recycling logic can be eliminated from here
7377          * and put into the async thread if we create another
7378          * list to hold ACE's for unjoined mcg's.
7379          */
7380         if (DEC_REF_DO_CYCLE(ace)) {
7381                 ibd_mce_t *mce;
7382 
7383                 /*
7384                  * Check with the lock taken: we decremented
7385                  * reference count without the lock, and some
7386                  * transmitter might already have bumped the
7387                  * reference count (possible in case of multicast
7388                  * disable when we leave the AH on the active
7389                  * list). If not still 0, get out, leaving the
7390                  * recycle bit intact.
7391                  *
7392                  * If the count is still 0, atomically transition
7393                  * the AH from the active to the free list, and queue
7394                  * a work request to leave the group and destroy the mce. No
7395                  * transmitter can be looking at the AH or
7396                  * the MCE in between, since we have the
7397                  * ac_mutex lock. In the SendOnly reap case,
7398                  * it is not necessary to hold the ac_mutex
7399                  * and recheck the ref count (since the AH was
7400                  * taken off the active list), we just do it
7401                  * to have uniform processing with the Full
7402                  * reap case.
7403                  */
7404                 mutex_enter(&state->id_ac_mutex);
7405                 mce = ace->ac_mce;
7406                 if (GET_REF_CYCLE(ace) == 0) {
7407                         CLEAR_REFCYCLE(ace);
7408                         /*
7409                          * Identify the case of fullmember reap as
7410                          * opposed to mcg trap reap. Also, port up
7411                          * might set ac_mce to NULL to indicate Tx
7412                          * cleanup should do no more than put the
7413                          * AH in the free list (see ibd_async_link).
7414                          */
7415                         if (mce != NULL) {
7416                                 ace->ac_mce = NULL;
7417                                 IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
7418                                 /*
7419                                  * mc_req was initialized at mce
7420                                  * creation time.
7421                                  */
7422                                 ibd_queue_work_slot(state,
7423                                     &mce->mc_req, IBD_ASYNC_REAP);
7424                         }
7425                         IBD_ACACHE_INSERT_FREE(state, ace);
7426                 }
7427                 mutex_exit(&state->id_ac_mutex);
7428         }
7429 }
7430 
7431 /*
7432  * Common code that deals with clean ups after a successful or
7433  * erroneous transmission attempt.
7434  */
7435 static void
7436 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
7437 {
7438         ibd_ace_t *ace = swqe->w_ahandle;
7439 
7440         DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
7441 
7442         /*
7443          * If this was a dynamic mapping in ibd_send(), we need to
7444          * unmap here. If this was an lso buffer we'd used for sending,
7445          * we need to release the lso buf to the pool, since the resource
7446          * is scarce. However, if this was simply a normal send using
7447          * the copybuf (present in each swqe), we don't need to release it.
7448          */
7449         if (swqe->swqe_im_mblk != NULL) {
7450                 if (swqe->w_buftype == IBD_WQE_MAPPED) {
7451                         ibd_unmap_mem(state, swqe);
7452                 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7453                         ibd_release_lsobufs(state,
7454                             swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7455                 }
7456                 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7457                 freemsg(swqe->swqe_im_mblk);
7458                 swqe->swqe_im_mblk = NULL;
7459         }
7460 
7461         /*
7462          * Drop the reference count on the AH; it can be reused
7463          * now for a different destination if there are no more
7464          * posted sends that will use it. This can be eliminated
7465          * if we can always associate each Tx buffer with an AH.
7466          * The ace can be null if we are cleaning up from the
7467          * ibd_send() error path.
7468          */
7469         if (ace != NULL) {
7470                 ibd_dec_ref_ace(state, ace);
7471         }
7472 
7473         /*
7474          * Release the send wqe for reuse.
7475          */
7476         swqe->swqe_next = NULL;
7477         ibd_release_swqe(state, swqe, swqe, 1);
7478 }
7479 
7480 static void
7481 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
7482 {
7483         ibd_ace_t *ace;
7484         ibd_swqe_t *swqe;
7485         int n = 0;
7486 
7487         DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
7488 
7489         for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
7490 
7491                 /*
7492                  * If this was a dynamic mapping in ibd_send(), we need to
7493                  * unmap here. If this was an lso buffer we'd used for sending,
7494                  * we need to release the lso buf to the pool, since the
7495                  * resource is scarce. However, if this was simply a normal
7496                  * send using the copybuf (present in each swqe), we don't need
7497                  * to release it.
7498                  */
7499                 if (swqe->swqe_im_mblk != NULL) {
7500                         if (swqe->w_buftype == IBD_WQE_MAPPED) {
7501                                 ibd_unmap_mem(state, swqe);
7502                         } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7503                                 ibd_release_lsobufs(state,
7504                                     swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7505                         }
7506                         ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7507                         freemsg(swqe->swqe_im_mblk);
7508                         swqe->swqe_im_mblk = NULL;
7509                 }
7510 
7511                 /*
7512                  * Drop the reference count on the AH; it can be reused
7513                  * now for a different destination if there are no more
7514                  * posted sends that will use it. This can be eliminated
7515                  * if we can always associate each Tx buffer with an AH.
7516                  * The ace can be null if we are cleaning up from the
7517                  * ibd_send() error path.
7518                  */
7519                 ace = swqe->w_ahandle;
7520                 if (ace != NULL) {
7521                         ibd_dec_ref_ace(state, ace);
7522                 }
7523                 n++;
7524         }
7525 
7526         /*
7527          * Release the send wqes for reuse.
7528          */
7529         ibd_release_swqe(state, head, tail, n);
7530 }
7531 
7532 /*
7533  * Processing to be done after receipt of a packet; hand off to GLD
7534  * in the format expected by GLD.  The received packet has this
7535  * format: 2b sap :: 00 :: data.
7536  */
7537 static mblk_t *
7538 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
7539 {
7540         ib_header_info_t *phdr;
7541         mblk_t *mp;
7542         ipoib_hdr_t *ipibp;
7543         ipha_t *iphap;
7544         ip6_t *ip6h;
7545         int len;
7546         ib_msglen_t pkt_len = wc->wc_bytes_xfer;
7547         uint32_t bufs;
7548 
7549         /*
7550          * Track the number of buffers handed up that need to be returned.
7551          */
7552         bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
7553 
7554         /* Never run out of rwqes, use allocb when running low */
7555         if (bufs >= state->id_rx_bufs_outstanding_limit) {
7556                 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7557                 atomic_inc_32(&state->id_rx_allocb);
7558                 mp = allocb(pkt_len, BPRI_HI);
7559                 if (mp) {
7560                         bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
7561                         ibd_post_recv(state, rwqe);
7562                 } else {        /* no memory */
7563                         atomic_inc_32(&state->id_rx_allocb_failed);
7564                         ibd_post_recv(state, rwqe);
7565                         return (NULL);
7566                 }
7567         } else {
7568                 mp = rwqe->rwqe_im_mblk;
7569         }
7570 
7571 
7572         /*
7573          * Adjust write pointer depending on how much data came in.
7574          */
7575         mp->b_wptr = mp->b_rptr + pkt_len;
7576 
7577         /*
7578          * Make sure this is NULL or we're in trouble.
7579          */
7580         if (mp->b_next != NULL) {
7581                 ibd_print_warn(state,
7582                     "ibd_process_rx: got duplicate mp from rcq?");
7583                 mp->b_next = NULL;
7584         }
7585 
7586         /*
7587          * The IB link layer delivers one of the IB link-layer headers,
7588          * the Global Routing Header (GRH).  The ibd driver uses the
7589          * information in the GRH to build the ib_header_info_t
7590          * structure and passes it up to GLDv3 along with the datagram.
7591          *
7592          * If the GRH is not valid, indicate that to GLDv3 by setting
7593          * the VerTcFlow field to 0.
7594          */
7595         phdr = (ib_header_info_t *)mp->b_rptr;
7596         if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
7597                 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
7598 
7599                 /* If it is a loopback packet, just drop it. */
7600                 if (state->id_enable_rc) {
7601                         if (bcmp(&phdr->ib_grh.ipoib_sqpn,
7602                             &state->rc_macaddr_loopback,
7603                             IPOIB_ADDRL) == 0) {
7604                                 freemsg(mp);
7605                                 return (NULL);
7606                         }
7607                 } else {
7608                         if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
7609                             IPOIB_ADDRL) == 0) {
7610                                 freemsg(mp);
7611                                 return (NULL);
7612                         }
7613                 }
7614 
7615                 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
7616                     sizeof (ipoib_mac_t));
7617                 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
7618                         phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
7619                         IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
7620                 } else {
7621                         phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
7622                 }
7623         } else {
7624                 /*
7625                  * It cannot be an IBA multicast packet; it must have been
7626                  * unicast to us.  Just copy the interface address to dst.
7627                  */
7628                 phdr->ib_grh.ipoib_vertcflow = 0;
7629                 ovbcopy(&state->id_macaddr, &phdr->ib_dst,
7630                     sizeof (ipoib_mac_t));
7631         }
7632 
7633         /*
7634          * For ND6 packets, padding is at the front of the source/target
7635          * lladdr.  However, the inet6 layer is not aware of it, so remove
7636          * the padding from such packets.
7637          */
7638         ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
7639         if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
7640                 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
7641                 len = ntohs(ip6h->ip6_plen);
7642                 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7643                         /* LINTED: E_CONSTANT_CONDITION */
7644                         IBD_PAD_NSNA(ip6h, len, IBD_RECV);
7645                 }
7646         }
7647 
7648         /*
7649          * Update statistics
7650          */
7651         atomic_add_64(&state->id_rcv_bytes, pkt_len);
7652         atomic_inc_64(&state->id_rcv_pkt);
7653         if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7654                 atomic_inc_64(&state->id_brd_rcv);
7655         else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7656                 atomic_inc_64(&state->id_multi_rcv);
7657 
7658         iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
7659         /*
7660          * Set the receive checksum status in mp.
7661          * Hardware checksumming can be considered valid only if:
7662          * 1. The CQE.IP_OK bit is set
7663          * 2. CQE.CKSUM == 0xffff
7664          * 3. No IPv6 routing header is present in the packet
7665          * 4. There are no IP options in the IP header
7666          */
7667 
7668         if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
7669             (wc->wc_cksum == 0xFFFF) &&
7670             (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
7671                 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
7672         }
7673 
7674         return (mp);
7675 }
7676 
7677 /*
7678  * Callback code invoked from STREAMs when the receive data buffer is
7679  * free for recycling.
7680  */
7681 static void
7682 ibd_freemsg_cb(char *arg)
7683 {
7684         ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
7685         ibd_state_t *state = rwqe->w_state;
7686 
7687         atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7688 
7689         /*
7690          * If the driver is stopped, just free the rwqe.
7691          */
7692         if (atomic_add_32_nv(&state->id_running, 0) == 0) {
7693                 DPRINT(6, "ibd_freemsg: wqe being freed");
7694                 rwqe->rwqe_im_mblk = NULL;
7695                 ibd_free_rwqe(state, rwqe);
7696                 return;
7697         }
7698 
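             /*
              * Re-arm the rwqe with a fresh mblk that wraps the same copy
              * buffer, and post it back to the receive queue.
              */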
7699         rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
7700             state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
7701         if (rwqe->rwqe_im_mblk == NULL) {
7702                 ibd_free_rwqe(state, rwqe);
7703                 DPRINT(6, "ibd_freemsg: desballoc failed");
7704                 return;
7705         }
7706 
7707         ibd_post_recv(state, rwqe);
7708 }
7709 
7710 static uint_t
7711 ibd_tx_recycle(caddr_t arg)
7712 {
7713         ibd_state_t *state = (ibd_state_t *)arg;
7714 
7715         /*
7716          * Poll for completed entries
7717          */
7718         ibd_poll_scq(state, state->id_scq_hdl);
7719 
7720         return (DDI_INTR_CLAIMED);
7721 }
7722 
7723 #ifdef IBD_LOGGING
7724 static void
7725 ibd_log_init(void)
7726 {
7727         ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
7728         ibd_lbuf_ndx = 0;
7729 
7730         mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
7731 }
7732 
7733 static void
7734 ibd_log_fini(void)
7735 {
7736         if (ibd_lbuf)
7737                 kmem_free(ibd_lbuf, IBD_LOG_SZ);
7738         ibd_lbuf_ndx = 0;
7739         ibd_lbuf = NULL;
7740 
7741         mutex_destroy(&ibd_lbuf_lock);
7742 }
7743 
7744 static void
7745 ibd_log(const char *fmt, ...)
7746 {
7747         va_list ap;
7748         uint32_t off;
7749         uint32_t msglen;
7750         char tmpbuf[IBD_DMAX_LINE];
7751 
7752         if (ibd_lbuf == NULL)
7753                 return;
7754 
7755         va_start(ap, fmt);
7756         msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
7757         va_end(ap);
7758 
7759         if (msglen >= IBD_DMAX_LINE)
7760                 msglen = IBD_DMAX_LINE - 1;
7761 
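             /*
              * Reserve this message's slot in the circular log buffer while
              * holding the lock, then copy the formatted text in after
              * dropping it; the reserved region is private to this message,
              * so the copy itself does not need the lock.
              */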
7762         mutex_enter(&ibd_lbuf_lock);
7763 
7764         off = ibd_lbuf_ndx;             /* current msg should go here */
7765         if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
7766                 ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
7767 
7768         ibd_lbuf_ndx += msglen;         /* place where next msg should start */
7769         ibd_lbuf[ibd_lbuf_ndx] = 0;     /* current msg should terminate */
7770 
7771         if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
7772                 ibd_lbuf_ndx = 0;
7773 
7774         mutex_exit(&ibd_lbuf_lock);
7775 
7776         bcopy(tmpbuf, ibd_lbuf+off, msglen);    /* no lock needed for this */
7777 }
7778 #endif
7779 
7780 /* ARGSUSED */
7781 static int
7782 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
7783     int *rvalp)
7784 {
7785         ibd_create_ioctl_t      *cmd = karg;
7786         ibd_state_t             *state, *port_state, *p;
7787         int                     i, err, rval = 0;
7788         mac_register_t          *macp;
7789         ibt_hca_portinfo_t      *pinfop = NULL;
7790         ibt_status_t            ibt_status;
7791         uint_t                  psize, pinfosz;
7792         boolean_t               force_create = B_FALSE;
7793 
7794         cmd->ibdioc.ioc_status = 0;
7795 
7796         if (cmd->ibdioc.ioc_port_inst < 0) {
7797                 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
7798                 return (EINVAL);
7799         }
7800         port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
7801         if (port_state == NULL) {
7802                 DPRINT(10, "ibd_create_partition: failed to get state %d",
7803                     cmd->ibdioc.ioc_port_inst);
7804                 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
7805                 return (EINVAL);
7806         }
7807 
7808         /* Limited PKeys not supported */
7809         if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
7810                 rval = EINVAL;
7811                 goto part_create_return;
7812         }
7813 
7814         if (cmd->ioc_force_create == 0) {
7815                 /*
7816                  * Check if the port pkey table contains the pkey for which
7817                  * this partition is being created.
7818                  */
7819                 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
7820                     port_state->id_port, &pinfop, &psize, &pinfosz);
7821 
7822                 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
7823                         rval = EINVAL;
7824                         goto part_create_return;
7825                 }
7826 
7827                 if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
7828                         rval = ENETDOWN;
7829                         cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
7830                         goto part_create_return;
7831                 }
7832 
7833                 for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
7834                         if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
7835                                 break;
7836                         }
7837                 }
7838                 if (i == pinfop->p_pkey_tbl_sz) {
7839                         rval = EINVAL;
7840                         cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
7841                         goto part_create_return;
7842                 }
7843         } else {
7844                 force_create = B_TRUE;
7845         }
7846 
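             /*
              * Fail with EEXIST if a partition object already exists for
              * this port instance, pkey and partition link id.
              */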
7847         mutex_enter(&ibd_objlist_lock);
7848         for (p = ibd_objlist_head; p; p = p->id_next) {
7849                 if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
7850                     (p->id_pkey == cmd->ioc_pkey) &&
7851                     (p->id_plinkid == cmd->ioc_partid)) {
7852                         mutex_exit(&ibd_objlist_lock);
7853                         rval = EEXIST;
7854                         cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
7855                         goto part_create_return;
7856                 }
7857         }
7858         mutex_exit(&ibd_objlist_lock);
7859 
7860         state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);
7861 
7862         state->id_type               = IBD_PARTITION_OBJ;
7863 
7864         state->id_plinkid    = cmd->ioc_partid;
7865         state->id_dlinkid    = cmd->ibdioc.ioc_linkid;
7866         state->id_port_inst  = cmd->ibdioc.ioc_port_inst;
7867 
7868         state->id_dip                = port_state->id_dip;
7869         state->id_port               = port_state->id_port;
7870         state->id_pkey               = cmd->ioc_pkey;
7871         state->id_hca_guid   = port_state->id_hca_guid;
7872         state->id_port_guid  = port_state->id_port_guid;
7873         state->id_force_create       = force_create;
7874 
7875         mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
7876         cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
7877 
7878         if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
7879                 rval = EIO;
7880                 cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
7881                 goto fail;
7882         }
7883 
7884         if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
7885                 rval = EAGAIN;
7886                 goto fail;
7887         }
7888 
7889         macp->m_type_ident   = MAC_PLUGIN_IDENT_IB;
7890         macp->m_dip          = port_state->id_dip;
7891         macp->m_instance     = (uint_t)-1;
7892         macp->m_driver               = state;
7893         macp->m_src_addr     = (uint8_t *)&state->id_macaddr;
7894         macp->m_callbacks    = &ibd_m_callbacks;
7895         macp->m_min_sdu              = 0;
7896         macp->m_multicast_sdu        = IBD_DEF_MAX_SDU;
7897         if (state->id_enable_rc) {
7898                 macp->m_max_sdu              = IBD_DEF_RC_MAX_SDU;
7899         } else {
7900                 macp->m_max_sdu              = IBD_DEF_MAX_SDU;
7901         }
7902         macp->m_priv_props = ibd_priv_props;
7903 
7904         err = mac_register(macp, &state->id_mh);
7905         mac_free(macp);
7906 
7907         if (err != 0) {
7908                 DPRINT(10, "ibd_create_partition: mac_register() failed %d",
7909                     err);
7910                 rval = err;
7911                 goto fail;
7912         }
7913 
7914         err = dls_devnet_create(state->id_mh,
7915             cmd->ioc_partid, crgetzoneid(credp));
7916         if (err != 0) {
7917                 DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
7918                     "%d", err);
7919                 rval = err;
7920                 (void) mac_unregister(state->id_mh);
7921                 goto fail;
7922         }
7923 
7924         /*
7925          * Add the new partition state structure to the list
7926          */
7927         mutex_enter(&ibd_objlist_lock);
7928         if (ibd_objlist_head)
7929                 state->id_next = ibd_objlist_head;
7930 
7931         ibd_objlist_head = state;
7932         mutex_exit(&ibd_objlist_lock);
7933 
7934 part_create_return:
7935         if (pinfop) {
7936                 ibt_free_portinfo(pinfop, pinfosz);
7937         }
7938         return (rval);
7939 
7940 fail:
7941         if (pinfop) {
7942                 ibt_free_portinfo(pinfop, pinfosz);
7943         }
7944         ibd_part_unattach(state);
7945         kmem_free(state, sizeof (ibd_state_t));
7946         return (rval);
7947 }
7948 
7949 /* ARGSUSED */
7950 static int
7951 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
7952     int *rvalp)
7953 {
7954         int err;
7955         datalink_id_t tmpid;
7956         ibd_state_t *node, *prev;
7957         ibd_delete_ioctl_t *cmd = karg;
7958 
7959         prev = NULL;
7960 
7961         mutex_enter(&ibd_objlist_lock);
7962         node = ibd_objlist_head;
7963 
7964         /* Find the ibd state structure corresponding to the partition */
7965         while (node != NULL) {
7966                 if (node->id_plinkid == cmd->ioc_partid)
7967                         break;
7968                 prev = node;
7969                 node = node->id_next;
7970         }
7971 
7972         if (node == NULL) {
7973                 mutex_exit(&ibd_objlist_lock);
7974                 return (ENOENT);
7975         }
7976 
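        /*
         * Remove the datalink first; it is restored below if the partition
         * cannot be deleted.
         */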
7977         if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
7978                 DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
7979                     "%d", err);
7980                 mutex_exit(&ibd_objlist_lock);
7981                 return (err);
7982         }
7983 
7984         /*
         * Call ibd_part_unattach() only after making sure that the instance
         * has not been started and is not in late HCA init mode.
7987          */
7988         ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
7989 
7990         err = 0;
7991         if ((node->id_mac_state & IBD_DRV_STARTED) ||
7992             (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
7993             (ibd_part_busy(node) != DDI_SUCCESS) ||
7994             ((err = mac_disable(node->id_mh)) != 0)) {
7995                 (void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
7996                     crgetzoneid(credp));
7997                 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
7998                 mutex_exit(&ibd_objlist_lock);
7999                 return (err != 0 ? err : EBUSY);
8000         }
8001 
8002         node->id_mac_state |= IBD_DRV_IN_DELETION;
8003 
8004         ibd_part_unattach(node);
8005 
8006         ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8007 
8008         /* Remove the partition state structure from the linked list */
8009         if (prev == NULL)
8010                 ibd_objlist_head = node->id_next;
8011         else
8012                 prev->id_next = node->id_next;
8013         mutex_exit(&ibd_objlist_lock);
8014 
8015         if ((err = mac_unregister(node->id_mh)) != 0) {
8016                 DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
8017                     err);
8018         }
8019 
8020         cv_destroy(&node->id_macst_cv);
8021         mutex_destroy(&node->id_macst_lock);
8022 
8023         kmem_free(node, sizeof (ibd_state_t));
8024 
8025         return (0);
8026 }
8027 
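/*
 * Handle the partition/port info ioctls: depending on ioc_info_cmd, copy
 * out the attributes of an existing partition object, the P_Key table of a
 * port, or the size of that P_Key table; 32-bit and 64-bit callers are both
 * supported.
 */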
8028 /* ARGSUSED */
8029 static int
8030 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
8031     int *rvalp)
8032 {
8033         ibd_ioctl_t             cmd;
8034         ibpart_ioctl_t          partioc;
8035         ibport_ioctl_t          portioc;
8036 #ifdef _MULTI_DATAMODEL
8037         ibport_ioctl32_t        portioc32;
8038 #endif
8039         ibd_state_t             *state, *port_state;
8040         int                     size;
8041         ibt_hca_portinfo_t      *pinfop = NULL;
8042         ibt_status_t            ibt_status;
8043         uint_t                  psize, pinfosz;
8044         int                     rval = 0;
8045 
8046         size = sizeof (ibd_ioctl_t);
8047         if (ddi_copyin((void *)arg, &cmd, size, mode)) {
8048                 return (EFAULT);
8049         }
8050         cmd.ioc_status = 0;
8051         switch (cmd.ioc_info_cmd) {
8052         case IBD_INFO_CMD_IBPART:
8053                 size = sizeof (ibpart_ioctl_t);
8054                 if (ddi_copyin((void *)arg, &partioc, size, mode)) {
8055                         return (EFAULT);
8056                 }
8057 
8058                 mutex_enter(&ibd_objlist_lock);
                /* Find the state structure corresponding to the partition */
8060                 for (state = ibd_objlist_head; state; state = state->id_next) {
8061                         if (state->id_plinkid == cmd.ioc_linkid) {
8062                                 break;
8063                         }
8064                 }
8065 
8066                 if (state == NULL) {
8067                         mutex_exit(&ibd_objlist_lock);
8068                         return (ENOENT);
8069                 }
8070 
8071                 partioc.ibdioc.ioc_linkid = state->id_dlinkid;
8072                 partioc.ibdioc.ioc_port_inst = state->id_port_inst;
8073                 partioc.ibdioc.ioc_portnum = state->id_port;
8074                 partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
8075                 partioc.ibdioc.ioc_portguid = state->id_port_guid;
8076                 partioc.ibdioc.ioc_status = 0;
8077                 partioc.ioc_partid = state->id_plinkid;
8078                 partioc.ioc_pkey = state->id_pkey;
8079                 partioc.ioc_force_create = state->id_force_create;
8080                 if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
8081                         mutex_exit(&ibd_objlist_lock);
8082                         return (EFAULT);
8083                 }
8084                 mutex_exit(&ibd_objlist_lock);
8085 
8086                 break;
8087 
8088         case IBD_INFO_CMD_IBPORT:
8089                 if ((cmd.ioc_port_inst < 0) || ((port_state =
8090                     ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
                        DPRINT(10, "ibd_get_partition_info: failed to get"
                            " state %d", cmd.ioc_port_inst);
8093                         size = sizeof (ibd_ioctl_t);
8094                         cmd.ioc_status = IBD_INVALID_PORT_INST;
8095                         if (ddi_copyout((void *)&cmd, (void *)arg, size,
8096                             mode)) {
8097                                 return (EFAULT);
8098                         }
8099                         return (EINVAL);
8100                 }
8101                 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8102                     port_state->id_port, &pinfop, &psize, &pinfosz);
                if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
                        rval = EINVAL;
                        goto fail;
                }
8106 #ifdef _MULTI_DATAMODEL
8107                 switch (ddi_model_convert_from(mode & FMODELS)) {
8108                 case DDI_MODEL_ILP32: {
8109                         size = sizeof (ibport_ioctl32_t);
8110                         if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8111                                 rval = EFAULT;
8112                                 goto fail;
8113                         }
8114                         portioc32.ibdioc.ioc_status = 0;
8115                         portioc32.ibdioc.ioc_portnum = port_state->id_port;
8116                         portioc32.ibdioc.ioc_hcaguid =
8117                             port_state->id_hca_guid;
8118                         portioc32.ibdioc.ioc_portguid =
8119                             port_state->id_port_guid;
8120                         if (portioc32.ioc_pkey_tbl_sz !=
8121                             pinfop->p_pkey_tbl_sz) {
8122                                 rval = EINVAL;
8123                                 size = sizeof (ibd_ioctl_t);
8124                                 portioc32.ibdioc.ioc_status =
8125                                     IBD_INVALID_PKEY_TBL_SIZE;
8126                                 if (ddi_copyout((void *)&portioc32.ibdioc,
8127                                     (void *)arg, size, mode)) {
8128                                         rval = EFAULT;
8129                                         goto fail;
8130                                 }
8131                                 goto fail;
8132                         }
8133                         size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8134                         if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8135                             (void *)(uintptr_t)portioc32.ioc_pkeys, size,
8136                             mode)) {
8137                                 rval = EFAULT;
8138                                 goto fail;
8139                         }
8140                         size = sizeof (ibport_ioctl32_t);
8141                         if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8142                             mode)) {
8143                                 rval = EFAULT;
8144                                 goto fail;
8145                         }
8146                         break;
8147                 }
8148                 case DDI_MODEL_NONE:
8149                         size = sizeof (ibport_ioctl_t);
8150                         if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8151                                 rval = EFAULT;
8152                                 goto fail;
8153                         }
8154                         portioc.ibdioc.ioc_status = 0;
8155                         portioc.ibdioc.ioc_portnum = port_state->id_port;
8156                         portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8157                         portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8158                         if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8159                                 rval = EINVAL;
8160                                 size = sizeof (ibd_ioctl_t);
8161                                 portioc.ibdioc.ioc_status =
8162                                     IBD_INVALID_PKEY_TBL_SIZE;
8163                                 if (ddi_copyout((void *)&portioc.ibdioc,
8164                                     (void *)arg, size, mode)) {
8165                                         rval = EFAULT;
8166                                         goto fail;
8167                                 }
8168                                 goto fail;
8169                         }
8170                         size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8171                         if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8172                             (void *)(portioc.ioc_pkeys), size, mode)) {
8173                                 rval = EFAULT;
8174                                 goto fail;
8175                         }
8176                         size = sizeof (ibport_ioctl_t);
8177                         if (ddi_copyout((void *)&portioc, (void *)arg, size,
8178                             mode)) {
8179                                 rval = EFAULT;
8180                                 goto fail;
8181                         }
8182                         break;
8183                 }
8184 #else /* ! _MULTI_DATAMODEL */
8185                 size = sizeof (ibport_ioctl_t);
8186                 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8187                         rval = EFAULT;
8188                         goto fail;
8189                 }
8190                 portioc.ibdioc.ioc_status = 0;
8191                 portioc.ibdioc.ioc_portnum = port_state->id_port;
8192                 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8193                 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8194                 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8195                         rval = EINVAL;
8196                         size = sizeof (ibd_ioctl_t);
8197                         portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
8198                         if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
8199                             size, mode)) {
8200                                 rval = EFAULT;
8201                                 goto fail;
8202                         }
8203                         goto fail;
8204                 }
8205                 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8206                 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8207                     (void *)(portioc.ioc_pkeys), size, mode)) {
8208                         rval = EFAULT;
8209                         goto fail;
8210                 }
8211                 size = sizeof (ibport_ioctl_t);
8212                 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8213                     mode)) {
8214                         rval = EFAULT;
8215                         goto fail;
8216                 }
8217 #endif /* _MULTI_DATAMODEL */
8218 
8219                 break;
8220 
8221         case IBD_INFO_CMD_PKEYTBLSZ:
8222                 if ((cmd.ioc_port_inst < 0) || ((port_state =
8223                     ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
                        DPRINT(10, "ibd_get_partition_info: failed to get"
                            " state %d", cmd.ioc_port_inst);
8226                         size = sizeof (ibd_ioctl_t);
8227                         cmd.ioc_status = IBD_INVALID_PORT_INST;
8228                         if (ddi_copyout((void *)&cmd, (void *)arg, size,
8229                             mode)) {
8230                                 return (EFAULT);
8231                         }
8232                         return (EINVAL);
8233                 }
8234                 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8235                     port_state->id_port, &pinfop, &psize, &pinfosz);
                if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
                        rval = EINVAL;
                        goto fail;
                }
8239 #ifdef _MULTI_DATAMODEL
8240                 switch (ddi_model_convert_from(mode & FMODELS)) {
8241                 case DDI_MODEL_ILP32: {
8242                         size = sizeof (ibport_ioctl32_t);
8243                         if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8244                                 rval = EFAULT;
8245                                 goto fail;
8246                         }
8247                         portioc32.ibdioc.ioc_status = 0;
8248                         portioc32.ibdioc.ioc_portnum = port_state->id_port;
8249                         portioc32.ibdioc.ioc_hcaguid =
8250                             port_state->id_hca_guid;
8251                         portioc32.ibdioc.ioc_portguid =
8252                             port_state->id_port_guid;
8253                         portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8254                         if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8255                             mode)) {
8256                                 rval = EFAULT;
8257                                 goto fail;
8258                         }
8259                         break;
8260                 }
8261                 case DDI_MODEL_NONE:
8262                         size = sizeof (ibport_ioctl_t);
8263                         if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8264                                 rval = EFAULT;
8265                                 goto fail;
8266                         }
8267                         portioc.ibdioc.ioc_status = 0;
8268                         portioc.ibdioc.ioc_portnum = port_state->id_port;
8269                         portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8270                         portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8271                         portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8272                         if (ddi_copyout((void *)&portioc, (void *)arg, size,
8273                             mode)) {
8274                                 rval = EFAULT;
8275                                 goto fail;
8276                         }
8277                         break;
8278                 }
8279 #else /* ! _MULTI_DATAMODEL */
8280                 size = sizeof (ibport_ioctl_t);
8281                 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8282                         rval = EFAULT;
8283                         goto fail;
8284                 }
8285                 portioc.ibdioc.ioc_status = 0;
8286                 portioc.ibdioc.ioc_portnum = port_state->id_port;
8287                 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8288                 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8289                 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8290                 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8291                     mode)) {
8292                         rval = EFAULT;
8293                         goto fail;
8294                 }
8295 #endif /* _MULTI_DATAMODEL */
8296                 break;
8297 
8298         default:
8299                 return (EINVAL);
8300 
8301         } /* switch (cmd.ioc_info_cmd) */
8302 fail:
8303         if (pinfop) {
8304                 ibt_free_portinfo(pinfop, pinfosz);
8305         }
8306         return (rval);
8307 }
8308 
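/*
 * IBTF async event handler for the port driver instance: port up/down
 * events are mapped to link state changes and reported to the MAC layer;
 * all other events are ignored.
 */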
8309 /* ARGSUSED */
8310 static void
8311 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
8312     ibt_async_code_t code, ibt_async_event_t *event)
8313 {
8314         ibd_state_t *state = (ibd_state_t *)arg;
8315         link_state_t    lstate;
8316 
8317         switch (code) {
8318         case IBT_EVENT_PORT_UP:
8319         case IBT_ERROR_PORT_DOWN:
8320                 if (ibd_get_port_state(state, &lstate) != 0)
8321                         break;
8322 
8323                 if (state->id_link_state != lstate) {
8324                         state->id_link_state = lstate;
8325                         mac_link_update(state->id_mh, lstate);
8326                 }
8327                 break;
8328         default:
8329                 break;
8330         }
8331 }
8332 
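/*
 * Query the port via ibt_query_hca_ports() and derive the GLDv3 link state;
 * the SGID and link speed cached in the soft state are refreshed as a side
 * effect. Returns 0 on success and -1 on failure.
 */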
8333 static int
8334 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
8335 {
8336         ibt_hca_portinfo_t *port_infop;
8337         uint_t psize, port_infosz;
8338         ibt_status_t    ret;
8339 
8340         ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
8341             &port_infop, &psize, &port_infosz);
8342         if ((ret != IBT_SUCCESS) || (psize != 1))
8343                 return (-1);
8344 
8345         state->id_sgid = *port_infop->p_sgid_tbl;
8346         state->id_link_speed = ibd_get_portspeed(state);
8347 
8348         if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
8349                 *lstate = LINK_STATE_UP;
8350         else
8351                 *lstate = LINK_STATE_DOWN;
8352 
8353         ibt_free_portinfo(port_infop, port_infosz);
8354         return (0);
8355 }
8356 
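/*
 * Attach handling for a port driver instance: allocate the soft state, read
 * the port number and HCA/port GUIDs from the devinfo properties, attach to
 * IBTF, open the HCA and register with the MAC layer.
 */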
8357 static int
8358 ibd_port_attach(dev_info_t *dip)
8359 {
8360         ibd_state_t             *state;
8361         link_state_t            lstate;
8362         int                     instance;
8363         ibt_status_t            ret;
8364 
8365         /*
8366          * Allocate softstate structure
8367          */
8368         instance = ddi_get_instance(dip);
8369         if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
8370                 DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
8371                 return (DDI_FAILURE);
8372         }
8373 
8374         state = ddi_get_soft_state(ibd_list, instance);
8375 
8376         state->id_dip = dip;
8377         state->id_type = IBD_PORT_DRIVER;
8378 
8379         if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
8380             "port-number", 0)) == 0) {
8381                 DPRINT(10, "ibd_port_attach: invalid port number (%d)",
8382                     state->id_port);
                goto done;
8384         }
8385         if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8386             "hca-guid", 0)) == 0) {
8387                 DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
8388                     state->id_hca_guid);
                goto done;
8390         }
8391         if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8392             "port-guid", 0)) == 0) {
8393                 DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
8394                     state->id_port_guid);
                goto done;
8396         }
8397 
8398         /*
8399          * Attach to IBTL
8400          */
8401         if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
8402             &state->id_ibt_hdl)) != IBT_SUCCESS) {
8403                 DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
8404                     ret);
8405                 goto done;
8406         }
8407 
8408         state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
8409 
8410         if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8411             &state->id_hca_hdl)) != IBT_SUCCESS) {
8412                 DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8413                     ret);
8414                 goto done;
8415         }
8416         state->id_mac_state |= IBD_DRV_HCA_OPENED;
8417 
8418         /* Update link status */
8419 
8420         if (ibd_get_port_state(state, &lstate) != 0) {
                DPRINT(10, "ibd_port_attach: ibd_get_port_state() failed");
8423                 goto done;
8424         }
8425         state->id_link_state = lstate;
8426         /*
8427          * Register ibd interfaces with the Nemo framework
8428          */
8429         if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
8430                 DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
8431                 goto done;
8432         }
8433         state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8434 
8435         mac_link_update(state->id_mh, lstate);
8436 
8437         return (DDI_SUCCESS);
8438 done:
8439         (void) ibd_port_unattach(state, dip);
8440         return (DDI_FAILURE);
8441 }
8442 
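/*
 * Undo whatever ibd_port_attach() completed, as recorded in the id_mac_state
 * progress bits, and free the soft state.
 */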
8443 static int
8444 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8445 {
8446         int instance;
8447         uint32_t progress = state->id_mac_state;
8448         ibt_status_t ret;
8449 
8450         if (progress & IBD_DRV_MAC_REGISTERED) {
8451                 (void) mac_unregister(state->id_mh);
8452                 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8453         }
8454 
8455         if (progress & IBD_DRV_HCA_OPENED) {
8456                 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8457                     IBT_SUCCESS) {
8458                         ibd_print_warn(state, "failed to close "
8459                             "HCA device, ret=%d", ret);
8460                 }
8461                 state->id_hca_hdl = NULL;
8462                 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8463         }
8464 
8465         if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8466                 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8467                         ibd_print_warn(state,
8468                             "ibt_detach() failed, ret=%d", ret);
8469                 }
8470                 state->id_ibt_hdl = NULL;
8471                 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8472         }
8473         instance = ddi_get_instance(dip);
8474         ddi_soft_state_free(ibd_list, instance);
8475 
8476         return (DDI_SUCCESS);
8477 }
8478 
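/*
 * Look up the partition object identified by the given partition link id and
 * copy its attributes into the supplied ibt_part_attr_t.
 */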
8479 ibt_status_t
8480 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8481 {
8482         ibd_state_t     *state;
8483 
8484         mutex_enter(&ibd_objlist_lock);
8485 
        /* Find the ibd state structure corresponding to the partition */
8487         for (state = ibd_objlist_head; state; state = state->id_next) {
8488                 if (state->id_plinkid == linkid) {
8489                         break;
8490                 }
8491         }
8492 
8493         if (state == NULL) {
8494                 mutex_exit(&ibd_objlist_lock);
8495                 return (IBT_NO_SUCH_OBJECT);
8496         }
8497 
8498         attr->pa_dlinkid = state->id_dlinkid;
8499         attr->pa_plinkid = state->id_plinkid;
8500         attr->pa_port = state->id_port;
8501         attr->pa_hca_guid = state->id_hca_guid;
8502         attr->pa_port_guid = state->id_port_guid;
8503         attr->pa_pkey = state->id_pkey;
8504 
8505         mutex_exit(&ibd_objlist_lock);
8506 
8507         return (IBT_SUCCESS);
8508 }
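/*
 * Return a KM_SLEEP allocated array containing the attributes of all
 * partition objects currently on the list; *nparts is set to the number of
 * entries. The caller is responsible for freeing the array.
 */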
8509 
8510 ibt_status_t
8511 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8512 {
8513         ibd_state_t     *state;
8514         int             n = 0;
8515         ibt_part_attr_t *attr;
8516 
8517         mutex_enter(&ibd_objlist_lock);
8518 
8519         for (state = ibd_objlist_head; state; state = state->id_next)
8520                 n++;
8521 
8522         *nparts = n;
8523         if (n == 0) {
8524                 *attr_list = NULL;
8525                 mutex_exit(&ibd_objlist_lock);
8526                 return (IBT_SUCCESS);
8527         }
8528 
8529         *attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8530         attr = *attr_list;
8531         for (state = ibd_objlist_head; state; state = state->id_next) {
8532 #ifdef DEBUG
8533                 ASSERT(n > 0);
8534                 n--;
8535 #endif
8536                 attr->pa_dlinkid = state->id_dlinkid;
8537                 attr->pa_plinkid = state->id_plinkid;
8538                 attr->pa_port = state->id_port;
8539                 attr->pa_hca_guid = state->id_hca_guid;
8540                 attr->pa_port_guid = state->id_port_guid;
8541                 attr->pa_pkey = state->id_pkey;
8542                 attr++;
8543         }
8544 
8545         mutex_exit(&ibd_objlist_lock);
8546         return (IBT_SUCCESS);
8547 }