7127  remove -Wno-missing-braces from Makefile.uts
    
      
    
          --- old/usr/src/uts/common/io/ib/clients/ibd/ibd.c
          +++ new/usr/src/uts/common/io/ib/clients/ibd/ibd.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   */
  25   25  
  26   26  /*
  27   27   * An implementation of the IPoIB standard based on PSARC 2001/289.
  28   28   */
  29   29  
  30   30  #include <sys/types.h>
  31   31  #include <sys/conf.h>
  32   32  #include <sys/ddi.h>
  33   33  #include <sys/sunddi.h>
  34   34  #include <sys/modctl.h>
  35   35  #include <sys/stropts.h>
  36   36  #include <sys/stream.h>
  37   37  #include <sys/strsun.h>
  38   38  #include <sys/strsubr.h>
  39   39  #include <sys/dlpi.h>
  40   40  #include <sys/mac_provider.h>
  41   41  
  42   42  #include <sys/pattr.h>          /* for HCK_FULLCKSUM */
  43   43  #include <sys/sysmacros.h>      /* for offsetof */
  44   44  #include <sys/disp.h>           /* for async thread pri */
  45   45  #include <sys/atomic.h>         /* for atomic_add*() */
  46   46  #include <sys/ethernet.h>       /* for ETHERTYPE_IPV6 */
  47   47  #include <netinet/in.h>         /* for netinet/ip.h below */
  48   48  #include <netinet/ip.h>         /* for struct ip */
  49   49  #include <netinet/udp.h>        /* for struct udphdr */
  50   50  #include <inet/common.h>        /* for inet/ip.h below */
  51   51  #include <inet/ip.h>            /* for ipha_t */
  52   52  #include <inet/ip6.h>           /* for ip6_t */
  53   53  #include <inet/tcp.h>           /* for tcph_t */
  54   54  #include <netinet/icmp6.h>      /* for icmp6_t */
  55   55  #include <sys/callb.h>
  56   56  #include <sys/modhash.h>
  57   57  
  58   58  #include <sys/ib/clients/ibd/ibd.h>
  59   59  #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */
  60   60  #include <sys/note.h>
  61   61  #include <sys/multidata.h>
  62   62  
  63   63  #include <sys/ib/mgt/ibmf/ibmf.h>       /* for ibd_get_portspeed */
  64   64  
  65   65  #include <sys/priv_names.h>
  66   66  #include <sys/dls.h>
  67   67  #include <sys/dld_ioc.h>
  68   68  #include <sys/policy.h>
  69   69  #include <sys/ibpart.h>
  70   70  #include <sys/file.h>
  71   71  
  72   72  /*
  73   73   * The write-up below includes details on the following:
  74   74   * 1. The dladm administrative model.
  75   75   * 2. Late HCA initialization feature.
  76   76   * 3. Brussels support and its implications for the current architecture.
  77   77   *
  78   78   * 1. The dladm administrative model.
  79   79   * ------------------------------------------
  80   80   * With the dladm model, ibnex will create one ibd instance per port. These
  81   81   * instances will be created independent of the port state.
  82   82   *
  83   83   * The ibd driver is two-faceted: one side of it works as the port driver and
  84   84   * the other as the partition object driver.
  85   85   *
  86   86   * The port instance is a child of the HCA, and will have an entry in the devfs.
  87   87   * A DDI attach only happens for the port driver, and its attach is
  88   88   * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
  89   89   * handled in ibd_port_unattach().
  90   90   *
  91   91   * The partition object is only a registrant to the mac layer via mac_register()
  92   92   * and does not have an entry in the device tree. There is no DDI softstate
  93   93   * managed by the DDI framework for the partition objects. However, the state is
  94   94   * managed inside the ibd driver, and every partition object hangs off the
  95   95   * "ibd_objlist_head".
  96   96   *
  97   97   * The partition object first comes into existence when a user runs the
  98   98   * 'create-part' subcommand of dladm. This is like invoking the attach entry
  99   99   * point of the partition object. The partition object goes away with the
 100  100   * 'delete-part' subcommand of dladm. This is like invoking the detach entry
 101  101   * point of the partition object.
 102  102   *
 103  103   * The create-part and delete-part subcommands result in dld ioctls that end up
 104  104   * calling ibd_create_partition() and ibd_delete_partition(), respectively.
 105  105   * These ioctls are registered with the dld layer in _init() via a call to
 106  106   * dld_ioc_register().
 107  107   *
 108  108   * The port instance by itself cannot be plumbed. It is only the partition
 109  109   * objects that can be plumbed and they alone participate in I/O and not the
 110  110   * port driver.
 111  111   *
 112  112   * There are some info ioctls supported in ibd which are used by dladm(1M) to
 113  113   * display useful information. The info entry point for ibd is
 114  114   * ibd_get_partition_info().
 115  115   *
 116  116   * 2. Late HCA initialization feature.
 117  117   * ------------------------------------
 118  118   * As mentioned in section 1, the user creates the partition objects via
 119  119   * dladm(1M). It is possible that:
 120  120   * a) The physical port itself is down and the SM cannot be reached.
 121  121   * b) The PKEY specified by the user has not been created in the SM yet.
 122  122   * c) An IPoIB broadcast group for the specified PKEY is not present.
 123  123   *
 124  124   * In all of the above cases, complete initialization of the partition object is
 125  125   * not possible. However, the new model allows the creation of partition
 126  126   * objects even in such cases but will defer the initialization for later.
 127  127   * When such a partition object is plumbed, the link state will be displayed as
 128  128   * "down".
 129  129   * The driver, at this point, is listening to events that herald the
 130  130   * availability of resources -
 131  131   * i)   LINK_UP when the link becomes available
 132  132   * ii)  PORT_CHANGE when the PKEY has been created
 133  133   * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
 134  134   * created
 135  135   * via ibd_async_handler() for events i) and ii), and via
 136  136   * ibd_snet_notices_handler() for iii).
 137  137   * The driver handles these events (as and when they arrive) and completes the
 138  138   * initialization of the partition object and transitions it to a usable state.
 139  139   *
 140  140   * 3. Brussels support and its implications for the current architecture.
 141  141   * ---------------------------------------------------------------------
 142  142   * The Brussels support introduces two new interfaces to the ibd driver -
 143  143   * ibd_m_getprop() and ibd_m_setprop().
 144  144   * These interfaces allow setting and retrieval of certain properties.
 145  145   * Some of them are public properties, while most others are private properties
 146  146   * meant to be used by developers. Tuning the latter can cause
 147  147   * performance issues and should not be done without understanding the
 148  148   * implications. All properties are specific to an instance of either the
 149  149   * partition object or the port driver.
 150  150   *
 151  151   * The public properties are : mtu and linkmode.
 152  152   * mtu is a read-only property.
 153  153   * linkmode can take two values - UD and CM.
 154  154   *
 155  155   * Changing the linkmode requires some bookkeeping in the driver. The
 156  156   * capabilities need to be re-reported to the mac layer. This is done by
 157  157   * calling mac_capab_update().  The maxsdu is updated by calling
 158  158   * mac_maxsdu_update2().
 159  159   * The private properties retain their values across the change of linkmode.
 160  160   * NOTE:
 161  161   * - The port driver does not support any property apart from mtu.
 162  162   * - All other properties are only meant for the partition object.
 163  163   * - The properties cannot be set when an instance is plumbed. The
 164  164   * instance has to be unplumbed to effect any setting.
 165  165   */
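
A concrete, hedged illustration of the administrative model described above
(link names and the PKEY are illustrative; see dladm(1M) for exact syntax):

        # dladm create-part -l ibp0 -P 0x8001 p8001.ibp0    <- ibd_create_partition()
        # dladm set-linkprop -p linkmode=cm p8001.ibp0      <- ibd_m_setprop()
        # dladm show-part p8001.ibp0                        <- ibd_get_partition_info()
        # dladm delete-part p8001.ibp0                      <- ibd_delete_partition()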
 166  166  
 167  167  /*
 168  168   * Driver wide tunables
 169  169   *
 170  170   * ibd_tx_softintr
 171  171   * ibd_rx_softintr
 172  172   *     The softintr mechanism allows ibd to avoid event queue overflows if
 173  173   *     the receive/completion handlers are expensive. These are enabled
 174  174   *     by default.
 175  175   *
 176  176   * ibd_log_sz
 177  177   *     This specifies the size of the ibd log buffer in bytes. The buffer is
 178  178   *     allocated and logging is enabled only when IBD_LOGGING is defined.
 179  179   *
 180  180   */
 181  181  uint_t ibd_rx_softintr = 1;
 182  182  uint_t ibd_tx_softintr = 1;
 183  183  
 184  184  #ifdef IBD_LOGGING
 185  185  uint_t ibd_log_sz = 0x20000;
 186  186  #endif
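
Because ibd_rx_softintr and ibd_tx_softintr are plain module globals, they
can be tuned from /etc/system; a sketch (values illustrative, the defaults
above are 1/enabled):

        set ibd:ibd_rx_softintr = 0
        set ibd:ibd_tx_softintr = 0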
 187  187  
 188  188  #ifdef IBD_LOGGING
 189  189  #define IBD_LOG_SZ                      ibd_log_sz
 190  190  #endif
 191  191  
 192  192  /* Post IBD_RX_POST_CNT receive work requests at a time. */
 193  193  #define IBD_RX_POST_CNT                 8
 194  194  
 195  195  /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
 196  196  #define IBD_LOG_RX_POST                 4
 197  197  
 198  198  /* Minimum number of receive work requests driver needs to always have */
 199  199  #define IBD_RWQE_MIN    ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
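
(With the defaults above this works out to (8 << 4) * 4 = 512 receive work
requests.)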
 200  200  
 201  201  /*
 202  202   * LSO parameters
 203  203   */
 204  204  #define IBD_LSO_MAXLEN                  65536
 205  205  #define IBD_LSO_BUFSZ                   8192
 206  206  
 207  207  /*
 208  208   * Async operation states
 209  209   */
 210  210  #define IBD_OP_NOTSTARTED               0
 211  211  #define IBD_OP_ONGOING                  1
 212  212  #define IBD_OP_COMPLETED                2
 213  213  #define IBD_OP_ERRORED                  3
 214  214  #define IBD_OP_ROUTERED                 4
 215  215  
 216  216  /*
 217  217   * Start/stop in-progress flags; note that restart must always remain
 218  218   * the OR of start and stop flag values.
 219  219   */
 220  220  #define IBD_DRV_START_IN_PROGRESS       0x10000000
 221  221  #define IBD_DRV_STOP_IN_PROGRESS        0x20000000
 222  222  #define IBD_DRV_RESTART_IN_PROGRESS     0x30000000
 223  223  #define IBD_DRV_DELETE_IN_PROGRESS      IBD_DRV_RESTART_IN_PROGRESS
 224  224  
 225  225  /*
 226  226   * Miscellaneous constants
 227  227   */
 228  228  #define IB_MGID_IPV4_LOWGRP_MASK        0xFFFFFFFF
 229  229  #define IBD_DEF_MAX_SDU                 2044
 230  230  #define IBD_DEF_MAX_MTU                 (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
 231  231  #define IBD_DEF_RC_MAX_SDU              65520
 232  232  #define IBD_DEF_RC_MAX_MTU              (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
 233  233  #define IBD_DEFAULT_QKEY                0xB1B
 234  234  #ifdef IBD_LOGGING
 235  235  #define IBD_DMAX_LINE                   100
 236  236  #endif
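
(Assuming IPOIB_HDRSIZE is the 4-byte IPoIB encapsulation header, the derived
values are IBD_DEF_MAX_MTU = 2044 + 4 = 2048 and IBD_DEF_RC_MAX_MTU =
65520 + 4 = 65524.)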
 237  237  
 238  238  /*
 239  239   * Enumerations for link states
 240  240   */
 241  241  typedef enum {
 242  242          IBD_LINK_DOWN,
 243  243          IBD_LINK_UP,
 244  244          IBD_LINK_UP_ABSENT
 245  245  } ibd_link_op_t;
 246  246  
 247  247  /*
 248  248   * Driver State Pointer
 249  249   */
 250  250  void *ibd_list;
 251  251  
 252  252  /*
 253  253   * Driver Global Data
 254  254   */
 255  255  ibd_global_state_t ibd_gstate;
 256  256  
 257  257  /*
 258  258   * Partition object list
 259  259   */
 260  260  ibd_state_t     *ibd_objlist_head = NULL;
 261  261  kmutex_t        ibd_objlist_lock;
 262  262  
 263  263  int ibd_rc_conn_timeout = 60 * 10;      /* 10 minutes */
 264  264  
 265  265  /*
 266  266   * Logging
 267  267   */
 268  268  #ifdef IBD_LOGGING
 269  269  kmutex_t ibd_lbuf_lock;
 270  270  uint8_t *ibd_lbuf;
 271  271  uint32_t ibd_lbuf_ndx;
 272  272  #endif
 273  273  
 274  274  /*
 275  275   * Required system entry points
 276  276   */
 277  277  static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
 278  278  static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
 279  279  
 280  280  /*
 281  281   * Required driver entry points for GLDv3
 282  282   */
 283  283  static int ibd_m_stat(void *, uint_t, uint64_t *);
 284  284  static int ibd_m_start(void *);
 285  285  static void ibd_m_stop(void *);
 286  286  static int ibd_m_promisc(void *, boolean_t);
 287  287  static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
 288  288  static int ibd_m_unicst(void *, const uint8_t *);
 289  289  static mblk_t *ibd_m_tx(void *, mblk_t *);
 290  290  static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
 291  291  
 292  292  static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
 293  293      const void *);
 294  294  static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
 295  295  static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
 296  296      mac_prop_info_handle_t);
 297  297  static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
 298  298      const void *);
 299  299  static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
 300  300  
 301  301  /*
 302  302   * Private driver entry points for GLDv3
 303  303   */
 304  304  
 305  305  /*
 306  306   * Initialization
 307  307   */
 308  308  static int ibd_state_init(ibd_state_t *, dev_info_t *);
 309  309  static int ibd_init_txlist(ibd_state_t *);
 310  310  static int ibd_init_rxlist(ibd_state_t *);
 311  311  static int ibd_acache_init(ibd_state_t *);
 312  312  #ifdef IBD_LOGGING
 313  313  static void ibd_log_init(void);
 314  314  #endif
 315  315  
 316  316  /*
 317  317   * Termination/cleanup
 318  318   */
 319  319  static void ibd_state_fini(ibd_state_t *);
 320  320  static void ibd_fini_txlist(ibd_state_t *);
 321  321  static void ibd_fini_rxlist(ibd_state_t *);
 322  322  static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
 323  323  static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
 324  324  static void ibd_acache_fini(ibd_state_t *);
 325  325  #ifdef IBD_LOGGING
 326  326  static void ibd_log_fini(void);
 327  327  #endif
 328  328  
 329  329  /*
 330  330   * Allocation/acquire/map routines
 331  331   */
 332  332  static int ibd_alloc_tx_copybufs(ibd_state_t *);
 333  333  static int ibd_alloc_rx_copybufs(ibd_state_t *);
 334  334  static int ibd_alloc_tx_lsobufs(ibd_state_t *);
 335  335  static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
 336  336  static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
 337  337      uint32_t *);
 338  338  
 339  339  /*
 340  340   * Free/release/unmap routines
 341  341   */
 342  342  static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
 343  343  static void ibd_free_tx_copybufs(ibd_state_t *);
 344  344  static void ibd_free_rx_copybufs(ibd_state_t *);
 345  345  static void ibd_free_rx_rsrcs(ibd_state_t *);
 346  346  static void ibd_free_tx_lsobufs(ibd_state_t *);
 347  347  static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
 348  348  static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
 349  349  static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
 350  350  
 351  351  /*
 352  352   * Handlers/callback routines
 353  353   */
 354  354  static uint_t ibd_intr(caddr_t);
 355  355  static uint_t ibd_tx_recycle(caddr_t);
 356  356  static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
 357  357  static void ibd_scq_handler(ibt_cq_hdl_t, void *);
 358  358  static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
 359  359  static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
 360  360  static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
 361  361  static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
 362  362  static void ibd_freemsg_cb(char *);
 363  363  static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
 364  364      ibt_async_event_t *);
 365  365  static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
 366  366      ibt_async_event_t *);
 367  367  static void ibd_snet_notices_handler(void *, ib_gid_t,
 368  368      ibt_subnet_event_code_t, ibt_subnet_event_t *);
 369  369  
 370  370  /*
 371  371   * Send/receive routines
 372  372   */
 373  373  static boolean_t ibd_send(ibd_state_t *, mblk_t *);
 374  374  static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
 375  375  static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
 376  376  static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
 377  377  
 378  378  /*
 379  379   * Threads
 380  380   */
 381  381  static void ibd_async_work(ibd_state_t *);
 382  382  
 383  383  /*
 384  384   * Async tasks
 385  385   */
 386  386  static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
 387  387  static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
 388  388  static void ibd_async_setprom(ibd_state_t *);
 389  389  static void ibd_async_unsetprom(ibd_state_t *);
 390  390  static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
 391  391  static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
 392  392  static void ibd_async_txsched(ibd_state_t *);
 393  393  static void ibd_async_link(ibd_state_t *, ibd_req_t *);
 394  394  
 395  395  /*
 396  396   * Async task helpers
 397  397   */
 398  398  static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
 399  399  static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
 400  400  static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
 401  401  static boolean_t ibd_get_allroutergroup(ibd_state_t *,
 402  402      ipoib_mac_t *, ipoib_mac_t *);
 403  403  static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
 404  404  static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
 405  405  static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
 406  406  static ibt_status_t ibd_find_bgroup(ibd_state_t *);
 407  407  static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
 408  408  static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
 409  409  static uint64_t ibd_get_portspeed(ibd_state_t *);
 410  410  static boolean_t ibd_async_safe(ibd_state_t *);
 411  411  static void ibd_async_done(ibd_state_t *);
 412  412  static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
 413  413  static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
 414  414  static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
 415  415  static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
 416  416  
 417  417  /*
 418  418   * Helpers for attach/start routines
 419  419   */
 420  420  static int ibd_register_mac(ibd_state_t *, dev_info_t *);
 421  421  static int ibd_record_capab(ibd_state_t *);
 422  422  static int ibd_get_port_details(ibd_state_t *);
 423  423  static int ibd_alloc_cqs(ibd_state_t *);
 424  424  static int ibd_setup_ud_channel(ibd_state_t *);
 425  425  static int ibd_start(ibd_state_t *);
 426  426  static int ibd_undo_start(ibd_state_t *, link_state_t);
 427  427  static void ibd_set_mac_progress(ibd_state_t *, uint_t);
 428  428  static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
 429  429  static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
 430  430  static void ibd_part_unattach(ibd_state_t *state);
 431  431  static int ibd_port_attach(dev_info_t *);
 432  432  static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
 433  433  static int ibd_get_port_state(ibd_state_t *, link_state_t *);
 434  434  static int ibd_part_busy(ibd_state_t *);
 435  435  
 436  436  /*
 437  437   * Miscellaneous helpers
 438  438   */
 439  439  static int ibd_sched_poll(ibd_state_t *, int, int);
 440  440  static void ibd_resume_transmission(ibd_state_t *);
 441  441  static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
 442  442  static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
 443  443  static void *list_get_head(list_t *);
 444  444  static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
 445  445  static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
 446  446  
 447  447  ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
 448  448  ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
 449  449  
 450  450  #ifdef IBD_LOGGING
 451  451  static void ibd_log(const char *, ...);
 452  452  #endif
 453  453  
 454  454  DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
 455  455      nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
  
 456  456  
 457  457  /* Module Driver Info */
 458  458  static struct modldrv ibd_modldrv = {
 459  459          &mod_driverops,                 /* This one is a driver */
 460  460          "InfiniBand GLDv3 Driver",      /* short description */
 461  461          &ibd_dev_ops                    /* driver specific ops */
 462  462  };
 463  463  
 464  464  /* Module Linkage */
 465  465  static struct modlinkage ibd_modlinkage = {
 466      -        MODREV_1, (void *)&ibd_modldrv, NULL
      466 +        MODREV_1, { (void *)&ibd_modldrv, NULL }
 467  467  };
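
This one-line change is the point of the fix: struct modlinkage's second
member is an array (void *ml_linkage[MODMAXLINK]), so a fully braced
initializer is needed before -Wno-missing-braces can be dropped from
Makefile.uts. A minimal sketch of the warning, using a hypothetical
stand-in struct of the same shape:

        struct ml_like { int ml_rev; void *ml_linkage[10]; };

        /* brace-elided: legal C, but gcc -Wmissing-braces warns */
        static struct ml_like bad  = { 1, (void *)0, (void *)0 };

        /* fully braced, as in the new line above: no warning */
        static struct ml_like good = { 1, { (void *)0, (void *)0 } };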
 468  468  
 469  469  /*
 470  470   * Module (static) info passed to IBTL during ibt_attach
 471  471   */
 472  472  static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
 473  473          IBTI_V_CURR,
 474  474          IBT_NETWORK,
 475  475          ibd_async_handler,
 476  476          NULL,
 477  477          "IBPART"
 478  478  };
 479  479  
 480  480  static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
 481  481          IBTI_V_CURR,
 482  482          IBT_NETWORK,
 483  483          ibdpd_async_handler,
 484  484          NULL,
 485  485          "IPIB"
 486  486  };
 487  487  
 488  488  /*
 489  489   * GLDv3 entry points
 490  490   */
 491  491  #define IBD_M_CALLBACK_FLAGS    \
 492  492          (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
 493  493  
 494  494  static mac_callbacks_t ibd_m_callbacks = {
 495  495          IBD_M_CALLBACK_FLAGS,
 496  496          ibd_m_stat,
 497  497          ibd_m_start,
 498  498          ibd_m_stop,
 499  499          ibd_m_promisc,
 500  500          ibd_m_multicst,
 501  501          ibd_m_unicst,
 502  502          ibd_m_tx,
 503  503          NULL,
 504  504          NULL,
 505  505          ibd_m_getcapab,
 506  506          NULL,
 507  507          NULL,
 508  508          ibd_m_setprop,
 509  509          ibd_m_getprop,
 510  510          ibd_m_propinfo
 511  511  };
 512  512  
 513  513  /* Private properties */
 514  514  char *ibd_priv_props[] = {
 515  515          "_ibd_broadcast_group",
 516  516          "_ibd_coalesce_completions",
 517  517          "_ibd_create_broadcast_group",
 518  518          "_ibd_hash_size",
 519  519          "_ibd_lso_enable",
 520  520          "_ibd_num_ah",
 521  521          "_ibd_num_lso_bufs",
 522  522          "_ibd_rc_enable_srq",
 523  523          "_ibd_rc_num_rwqe",
 524  524          "_ibd_rc_num_srq",
 525  525          "_ibd_rc_num_swqe",
 526  526          "_ibd_rc_rx_comp_count",
 527  527          "_ibd_rc_rx_comp_usec",
 528  528          "_ibd_rc_rx_copy_thresh",
 529  529          "_ibd_rc_rx_rwqe_thresh",
 530  530          "_ibd_rc_tx_comp_count",
 531  531          "_ibd_rc_tx_comp_usec",
 532  532          "_ibd_rc_tx_copy_thresh",
 533  533          "_ibd_ud_num_rwqe",
 534  534          "_ibd_ud_num_swqe",
 535  535          "_ibd_ud_rx_comp_count",
 536  536          "_ibd_ud_rx_comp_usec",
 537  537          "_ibd_ud_tx_comp_count",
 538  538          "_ibd_ud_tx_comp_usec",
 539  539          "_ibd_ud_tx_copy_thresh",
 540  540          NULL
 541  541  };
 542  542  
 543  543  static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
 544  544  static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
 545  545  static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
 546  546  
 547  547  static dld_ioc_info_t ibd_dld_ioctl_list[] = {
 548  548          {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
 549  549              ibd_create_partition, secpolicy_dl_config},
 550  550          {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
 551  551              ibd_delete_partition, secpolicy_dl_config},
 552  552          {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
 553  553              ibd_get_partition_info, NULL}
 554  554  };
 555  555  
 556  556  /*
 557  557   * Fill/clear <scope> and <p_key> in multicast/broadcast address
 558  558   */
 559  559  #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)         \
 560  560  {                                                       \
 561  561          *(uint32_t *)((char *)(maddr) + 4) |=           \
 562  562              htonl((uint32_t)(scope) << 16);             \
 563  563          *(uint32_t *)((char *)(maddr) + 8) |=           \
 564  564              htonl((uint32_t)(pkey) << 16);              \
 565  565  }
 566  566  
 567  567  #define IBD_CLEAR_SCOPE_PKEY(maddr)                     \
 568  568  {                                                       \
 569  569          *(uint32_t *)((char *)(maddr) + 4) &=           \
 570  570              htonl(~((uint32_t)0xF << 16));              \
 571  571          *(uint32_t *)((char *)(maddr) + 8) &=           \
 572  572              htonl(~((uint32_t)0xFFFF << 16));           \
 573  573  }
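
A hedged usage sketch of the two macros (variable names hypothetical; 0x2 is
the link-local scope nibble):

        ipoib_mac_t mcast_mac;  /* already holds the base multicast GID */

        /* OR the 4-bit scope into the word at bytes 4-7 and the 16-bit
         * PKEY into the word at bytes 8-11 of the GID */
        IBD_FILL_SCOPE_PKEY(&mcast_mac, 0x2, state->id_pkey);
        /* ... join or send using mcast_mac ... */
        IBD_CLEAR_SCOPE_PKEY(&mcast_mac);       /* restore canonical form */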
 574  574  
 575  575  /*
 576  576   * Rudimentary debugging support
 577  577   */
 578  578  #ifdef DEBUG
 579  579  int ibd_debuglevel = 100;
 580  580  void
 581  581  debug_print(int l, char *fmt, ...)
 582  582  {
 583  583          va_list ap;
 584  584  
 585  585          if (l < ibd_debuglevel)
 586  586                  return;
 587  587          va_start(ap, fmt);
 588  588          vcmn_err(CE_CONT, fmt, ap);
 589  589          va_end(ap);
 590  590  }
 591  591  #endif
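
With the default ibd_debuglevel of 100, lower-level calls such as
DPRINT(10, ...) seen elsewhere in this file print nothing, since
debug_print() returns when l < ibd_debuglevel (this assumes DPRINT() maps
to debug_print() on DEBUG kernels). Lowering the threshold at runtime,
e.g. with mdb -kw, makes them visible:

        > ibd_debuglevel/W 0t10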
 592  592  
 593  593  /*
 594  594   * Common routine to print warning messages; adds in hca guid, port number
 595  595   * and pkey to be able to identify the IBA interface.
 596  596   */
 597  597  void
 598  598  ibd_print_warn(ibd_state_t *state, char *fmt, ...)
 599  599  {
 600  600          ib_guid_t hca_guid;
 601  601          char ibd_print_buf[MAXNAMELEN + 256];
 602  602          int len;
 603  603          va_list ap;
 604  604          char part_name[MAXNAMELEN];
 605  605          datalink_id_t linkid = state->id_plinkid;
 606  606  
 607  607          hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
 608  608              0, "hca-guid", 0);
 609  609          (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL);
 610  610          len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
 611  611              "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ",
 612  612              ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
 613  613              (u_longlong_t)hca_guid, state->id_port, state->id_pkey,
 614  614              part_name);
 615  615          va_start(ap, fmt);
 616  616          (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
 617  617              fmt, ap);
 618  618          cmn_err(CE_NOTE, "!%s", ibd_print_buf);
 619  619          va_end(ap);
 620  620  }
 621  621  
 622  622  /*
 623  623   * Warlock directives
 624  624   */
 625  625  
 626  626  /*
 627  627   * id_lso_lock
 628  628   *
 629  629   * state->id_lso->bkt_nfree may be accessed without a lock to
 630  630   * determine the threshold at which we have to ask the nw layer
 631  631   * to resume transmission (see ibd_resume_transmission()).
 632  632   */
 633  633  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
 634  634      ibd_state_t::id_lso))
 635  635  _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
 636  636  _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
 637  637  _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
 638  638  
 639  639  /*
 640  640   * id_scq_poll_lock
 641  641   */
 642  642  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
 643  643      ibd_state_t::id_scq_poll_busy))
 644  644  
 645  645  /*
 646  646   * id_txpost_lock
 647  647   */
 648  648  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
 649  649      ibd_state_t::id_tx_head))
 650  650  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
 651  651      ibd_state_t::id_tx_busy))
 652  652  
 653  653  /*
 654  654   * id_acache_req_lock
 655  655   */
 656  656  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 
 657  657      ibd_state_t::id_acache_req_cv))
 658  658  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 
 659  659      ibd_state_t::id_req_list))
 660  660  _NOTE(SCHEME_PROTECTS_DATA("atomic",
 661  661      ibd_acache_s::ac_ref))
 662  662  
 663  663  /*
 664  664   * id_ac_mutex
 665  665   *
 666  666   * This mutex is actually supposed to protect id_ah_op as well,
 667  667   * but this path of the code isn't clean (see update of id_ah_op
 668  668   * in ibd_async_acache(), immediately after the call to
 669  669   * ibd_async_mcache()). For now, we'll skip this check by
 670  670   * declaring that id_ah_op is protected by some internal scheme
 671  671   * that warlock isn't aware of.
 672  672   */
 673  673  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
 674  674      ibd_state_t::id_ah_active))
 675  675  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
 676  676      ibd_state_t::id_ah_free))
 677  677  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
 678  678      ibd_state_t::id_ah_addr))
 679  679  _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
 680  680      ibd_state_t::id_ah_op))
 681  681  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
 682  682      ibd_state_t::id_ah_error))
 683  683  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
 684  684      ibd_state_t::id_ac_hot_ace))
 685  685  _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
 686  686  
 687  687  /*
 688  688   * id_mc_mutex
 689  689   */
 690  690  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
 691  691      ibd_state_t::id_mc_full))
 692  692  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
 693  693      ibd_state_t::id_mc_non))
 694  694  
 695  695  /*
 696  696   * id_trap_lock
 697  697   */
 698  698  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
 699  699      ibd_state_t::id_trap_cv))
 700  700  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
 701  701      ibd_state_t::id_trap_stop))
 702  702  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
 703  703      ibd_state_t::id_trap_inprog))
 704  704  
 705  705  /*
 706  706   * id_prom_op
 707  707   */
 708  708  _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
 709  709      ibd_state_t::id_prom_op))
 710  710  
 711  711  /*
 712  712   * id_sched_lock
 713  713   */
 714  714  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
 715  715      ibd_state_t::id_sched_needed))
 716  716  
 717  717  /*
 718  718   * id_link_mutex
 719  719   */
 720  720  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 
 721  721      ibd_state_t::id_link_state))
 722  722  _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
 723  723  _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
 724  724      ibd_state_t::id_link_speed))
 725  725  _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
 726  726  
 727  727  /*
 728  728   * id_tx_list.dl_mutex
 729  729   */
 730  730  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
 731  731      ibd_state_t::id_tx_list.dl_head))
 732  732  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
 733  733      ibd_state_t::id_tx_list.dl_pending_sends))
 734  734  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
 735  735      ibd_state_t::id_tx_list.dl_cnt))
 736  736  
 737  737  /*
 738  738   * id_rx_list.dl_mutex
 739  739   */
 740  740  _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
 741  741      ibd_state_t::id_rx_list.dl_bufs_outstanding))
 742  742  _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
 743  743      ibd_state_t::id_rx_list.dl_cnt))
 744  744  
 745  745  /*
 746  746   * rc_timeout_lock
 747  747   */
 748  748  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
 749  749      ibd_state_t::rc_timeout_start))
 750  750  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
 751  751      ibd_state_t::rc_timeout))
 752  752  
 753  753  
 754  754  /*
 755  755   * Items protected by atomic updates
 756  756   */
 757  757  _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
 758  758      ibd_state_s::id_brd_rcv
 759  759      ibd_state_s::id_brd_xmt
 760  760      ibd_state_s::id_multi_rcv
 761  761      ibd_state_s::id_multi_xmt
 762  762      ibd_state_s::id_num_intrs
 763  763      ibd_state_s::id_rcv_bytes
 764  764      ibd_state_s::id_rcv_pkt
 765  765      ibd_state_s::id_rx_post_queue_index
 766  766      ibd_state_s::id_tx_short
 767  767      ibd_state_s::id_xmt_bytes
 768  768      ibd_state_s::id_xmt_pkt
 769  769      ibd_state_s::rc_rcv_trans_byte
 770  770      ibd_state_s::rc_rcv_trans_pkt
 771  771      ibd_state_s::rc_rcv_copy_byte
 772  772      ibd_state_s::rc_rcv_copy_pkt
 773  773      ibd_state_s::rc_xmt_bytes
 774  774      ibd_state_s::rc_xmt_small_pkt
 775  775      ibd_state_s::rc_xmt_fragmented_pkt
 776  776      ibd_state_s::rc_xmt_map_fail_pkt
 777  777      ibd_state_s::rc_xmt_map_succ_pkt
 778  778      ibd_rc_chan_s::rcq_invoking))
 779  779  
 780  780  /*
 781  781   * Non-mutex protection schemes for data elements. Almost all of
 782  782   * these are non-shared items.
 783  783   */
 784  784  _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
 785  785      callb_cpr
 786  786      ib_gid_s
 787  787      ib_header_info
 788  788      ibd_acache_rq
 789  789      ibd_acache_s::ac_mce
 790  790      ibd_acache_s::ac_chan
 791  791      ibd_mcache::mc_fullreap
 792  792      ibd_mcache::mc_jstate
 793  793      ibd_mcache::mc_req
 794  794      ibd_rwqe_s
 795  795      ibd_swqe_s
 796  796      ibd_wqe_s
 797  797      ibt_wr_ds_s::ds_va
 798  798      ibt_wr_lso_s
 799  799      ipoib_mac::ipoib_qpn
 800  800      mac_capab_lso_s
 801  801      msgb::b_next
 802  802      msgb::b_cont
 803  803      msgb::b_rptr
 804  804      msgb::b_wptr
 805  805      ibd_state_s::id_bgroup_created
 806  806      ibd_state_s::id_mac_state
 807  807      ibd_state_s::id_mtu
 808  808      ibd_state_s::id_ud_num_rwqe
 809  809      ibd_state_s::id_ud_num_swqe
 810  810      ibd_state_s::id_qpnum
 811  811      ibd_state_s::id_rcq_hdl
 812  812      ibd_state_s::id_rx_buf_sz
 813  813      ibd_state_s::id_rx_bufs
 814  814      ibd_state_s::id_rx_mr_hdl
 815  815      ibd_state_s::id_rx_wqes
 816  816      ibd_state_s::id_rxwcs
 817  817      ibd_state_s::id_rxwcs_size
 818  818      ibd_state_s::id_rx_nqueues
 819  819      ibd_state_s::id_rx_queues
 820  820      ibd_state_s::id_scope
 821  821      ibd_state_s::id_scq_hdl
 822  822      ibd_state_s::id_tx_buf_sz
 823  823      ibd_state_s::id_tx_bufs
 824  824      ibd_state_s::id_tx_mr_hdl
 825  825      ibd_state_s::id_tx_rel_list.dl_cnt
 826  826      ibd_state_s::id_tx_wqes
 827  827      ibd_state_s::id_txwcs
 828  828      ibd_state_s::id_txwcs_size
 829  829      ibd_state_s::rc_listen_hdl
 830  830      ibd_state_s::rc_listen_hdl_OFED_interop
 831  831      ibd_state_s::rc_srq_size
 832  832      ibd_state_s::rc_srq_rwqes
 833  833      ibd_state_s::rc_srq_rx_bufs
 834  834      ibd_state_s::rc_srq_rx_mr_hdl
 835  835      ibd_state_s::rc_tx_largebuf_desc_base
 836  836      ibd_state_s::rc_tx_mr_bufs
 837  837      ibd_state_s::rc_tx_mr_hdl
 838  838      ipha_s
 839  839      icmph_s
 840  840      ibt_path_info_s::pi_sid
 841  841      ibd_rc_chan_s::ace
 842  842      ibd_rc_chan_s::chan_hdl
 843  843      ibd_rc_chan_s::state
 844  844      ibd_rc_chan_s::chan_state
 845  845      ibd_rc_chan_s::is_tx_chan
 846  846      ibd_rc_chan_s::rcq_hdl
 847  847      ibd_rc_chan_s::rcq_size
 848  848      ibd_rc_chan_s::scq_hdl
 849  849      ibd_rc_chan_s::scq_size
 850  850      ibd_rc_chan_s::rx_bufs
 851  851      ibd_rc_chan_s::rx_mr_hdl
 852  852      ibd_rc_chan_s::rx_rwqes
 853  853      ibd_rc_chan_s::tx_wqes
 854  854      ibd_rc_chan_s::tx_mr_bufs
 855  855      ibd_rc_chan_s::tx_mr_hdl
 856  856      ibd_rc_chan_s::tx_rel_list.dl_cnt
 857  857      ibd_rc_chan_s::is_used
 858  858      ibd_rc_tx_largebuf_s::lb_buf
 859  859      ibd_rc_msg_hello_s
 860  860      ibt_cm_return_args_s))
 861  861  
 862  862  /*
 863  863   * ibd_rc_chan_s::next is protected by two mutexes:
 864  864   * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
 865  865   * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
 866  866   */
 867  867  _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
 868  868      ibd_rc_chan_s::next))
 869  869  
 870  870  /*
 871  871   * ibd_state_s.rc_tx_large_bufs_lock
 872  872   */
 873  873  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
 874  874      ibd_state_s::rc_tx_largebuf_free_head))
 875  875  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
 876  876      ibd_state_s::rc_tx_largebuf_nfree))
 877  877  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
 878  878      ibd_rc_tx_largebuf_s::lb_next))
 879  879  
 880  880  /*
 881  881   * ibd_acache_s.tx_too_big_mutex
 882  882   */
 883  883  _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
 884  884      ibd_acache_s::tx_too_big_ongoing))
 885  885  
 886  886  /*
 887  887   * tx_wqe_list.dl_mutex
 888  888   */
 889  889  _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
 890  890      ibd_rc_chan_s::tx_wqe_list.dl_head))
 891  891  _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
 892  892      ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
 893  893  _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
 894  894      ibd_rc_chan_s::tx_wqe_list.dl_cnt))
 895  895  
 896  896  /*
 897  897   * ibd_state_s.rc_ace_recycle_lock
 898  898   */
 899  899  _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
 900  900      ibd_state_s::rc_ace_recycle))
 901  901  
 902  902  /*
 903  903   * rc_srq_rwqe_list.dl_mutex
 904  904   */
 905  905  _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
 906  906      ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
 907  907  _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
 908  908      ibd_state_t::rc_srq_rwqe_list.dl_cnt))
 909  909  
 910  910  /*
 911  911   * Non-mutex protection schemes for data elements. They are counters
 912  912   * for problem diagnosis and do not need to be protected.
 913  913   */
 914  914  _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
 915  915      ibd_state_s::rc_rcv_alloc_fail
 916  916      ibd_state_s::rc_rcq_err
 917  917      ibd_state_s::rc_ace_not_found
 918  918      ibd_state_s::rc_xmt_drop_too_long_pkt
 919  919      ibd_state_s::rc_xmt_icmp_too_long_pkt
 920  920      ibd_state_s::rc_xmt_reenter_too_long_pkt
 921  921      ibd_state_s::rc_swqe_short
 922  922      ibd_state_s::rc_swqe_mac_update
 923  923      ibd_state_s::rc_xmt_buf_short
 924  924      ibd_state_s::rc_xmt_buf_mac_update
 925  925      ibd_state_s::rc_scq_no_swqe
 926  926      ibd_state_s::rc_scq_no_largebuf
 927  927      ibd_state_s::rc_conn_succ
 928  928      ibd_state_s::rc_conn_fail
 929  929      ibd_state_s::rc_null_conn
 930  930      ibd_state_s::rc_no_estab_conn
 931  931      ibd_state_s::rc_act_close
 932  932      ibd_state_s::rc_pas_close
 933  933      ibd_state_s::rc_delay_ace_recycle
 934  934      ibd_state_s::rc_act_close_simultaneous
 935  935      ibd_state_s::rc_act_close_not_clean
 936  936      ibd_state_s::rc_pas_close_rcq_invoking
 937  937      ibd_state_s::rc_reset_cnt
 938  938      ibd_state_s::rc_timeout_act
 939  939      ibd_state_s::rc_timeout_pas
 940  940      ibd_state_s::rc_stop_connect))
 941  941  
 942  942  #ifdef DEBUG
 943  943  /*
 944  944   * Non-mutex protection schemes for data elements. They are counters
 945  945   * for problem diagnosis and do not need to be protected.
 946  946   */
 947  947  _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
 948  948      ibd_state_s::rc_rwqe_short
 949  949      ibd_rc_stat_s::rc_rcv_trans_byte
 950  950      ibd_rc_stat_s::rc_rcv_trans_pkt
 951  951      ibd_rc_stat_s::rc_rcv_copy_byte
 952  952      ibd_rc_stat_s::rc_rcv_copy_pkt
 953  953      ibd_rc_stat_s::rc_rcv_alloc_fail
 954  954      ibd_rc_stat_s::rc_rcq_err 
 955  955      ibd_rc_stat_s::rc_rwqe_short
 956  956      ibd_rc_stat_s::rc_xmt_bytes
 957  957      ibd_rc_stat_s::rc_xmt_small_pkt
 958  958      ibd_rc_stat_s::rc_xmt_fragmented_pkt
 959  959      ibd_rc_stat_s::rc_xmt_map_fail_pkt
 960  960      ibd_rc_stat_s::rc_xmt_map_succ_pkt
 961  961      ibd_rc_stat_s::rc_ace_not_found
 962  962      ibd_rc_stat_s::rc_scq_no_swqe
 963  963      ibd_rc_stat_s::rc_scq_no_largebuf
 964  964      ibd_rc_stat_s::rc_swqe_short
 965  965      ibd_rc_stat_s::rc_swqe_mac_update
 966  966      ibd_rc_stat_s::rc_xmt_buf_short
 967  967      ibd_rc_stat_s::rc_xmt_buf_mac_update
 968  968      ibd_rc_stat_s::rc_conn_succ
 969  969      ibd_rc_stat_s::rc_conn_fail
 970  970      ibd_rc_stat_s::rc_null_conn
 971  971      ibd_rc_stat_s::rc_no_estab_conn
 972  972      ibd_rc_stat_s::rc_act_close
 973  973      ibd_rc_stat_s::rc_pas_close
 974  974      ibd_rc_stat_s::rc_delay_ace_recycle
 975  975      ibd_rc_stat_s::rc_act_close_simultaneous
 976  976      ibd_rc_stat_s::rc_reset_cnt
 977  977      ibd_rc_stat_s::rc_timeout_act
 978  978      ibd_rc_stat_s::rc_timeout_pas))
 979  979  #endif
 980  980  
 981  981  int
 982  982  _init()
 983  983  {
 984  984          int status;
 985  985  
 986  986          status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
 987  987              PAGESIZE), 0);
 988  988          if (status != 0) {
 989  989                  DPRINT(10, "_init:failed in ddi_soft_state_init()");
 990  990                  return (status);
 991  991          }
 992  992  
 993  993          mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
 994  994  
 995  995          mac_init_ops(&ibd_dev_ops, "ibp");
 996  996          status = mod_install(&ibd_modlinkage);
 997  997          if (status != 0) {
 998  998                  DPRINT(10, "_init:failed in mod_install()");
 999  999                  ddi_soft_state_fini(&ibd_list);
1000 1000                  mac_fini_ops(&ibd_dev_ops);
1001 1001                  return (status);
1002 1002          }
1003 1003  
1004 1004          mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
1005 1005          mutex_enter(&ibd_gstate.ig_mutex);
1006 1006          ibd_gstate.ig_ibt_hdl = NULL;
1007 1007          ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
1008 1008          ibd_gstate.ig_service_list = NULL;
1009 1009          mutex_exit(&ibd_gstate.ig_mutex);
1010 1010  
1011 1011          if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
1012 1012              DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
1013 1013                  return (EIO);
1014 1014          }
1015 1015  
1016 1016          ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
1017 1017  
1018 1018  #ifdef IBD_LOGGING
1019 1019          ibd_log_init();
1020 1020  #endif
1021 1021          return (0);
1022 1022  }
1023 1023  
1024 1024  int
1025 1025  _info(struct modinfo *modinfop)
1026 1026  {
1027 1027          return (mod_info(&ibd_modlinkage, modinfop));
1028 1028  }
1029 1029  
1030 1030  int
1031 1031  _fini()
1032 1032  {
1033 1033          int status;
1034 1034  
1035 1035          status = mod_remove(&ibd_modlinkage);
1036 1036          if (status != 0)
1037 1037                  return (status);
1038 1038  
1039 1039          ibt_unregister_part_attr_cb();
1040 1040  
1041 1041          mac_fini_ops(&ibd_dev_ops);
1042 1042          mutex_destroy(&ibd_objlist_lock);
1043 1043          ddi_soft_state_fini(&ibd_list);
1044 1044          mutex_destroy(&ibd_gstate.ig_mutex);
1045 1045  #ifdef IBD_LOGGING
1046 1046          ibd_log_fini();
1047 1047  #endif
1048 1048          return (0);
1049 1049  }
1050 1050  
1051 1051  /*
1052 1052   * Convert the GID part of the mac address from network byte order
1053 1053   * to host order.
1054 1054   */
1055 1055  static void
1056 1056  ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
1057 1057  {
1058 1058          ib_sn_prefix_t nbopref;
1059 1059          ib_guid_t nboguid;
1060 1060  
1061 1061          bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
1062 1062          bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
1063 1063          dgid->gid_prefix = b2h64(nbopref);
1064 1064          dgid->gid_guid = b2h64(nboguid);
1065 1065  }
1066 1066  
1067 1067  /*
1068 1068   * Create the IPoIB address in network byte order from host order inputs.
1069 1069   */
1070 1070  static void
1071 1071  ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
1072 1072      ib_guid_t guid)
1073 1073  {
1074 1074          ib_sn_prefix_t nbopref;
1075 1075          ib_guid_t nboguid;
1076 1076  
1077 1077          mac->ipoib_qpn = htonl(qpn);
1078 1078          nbopref = h2b64(prefix);
1079 1079          nboguid = h2b64(guid);
1080 1080          bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
1081 1081          bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
1082 1082  }
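
A hedged round-trip sketch of the two converters ("sgid" stands in for any
host-order ib_gid_t, e.g. the port GID):

        ipoib_mac_t mac;
        ib_gid_t gid;

        /* build the 20-byte IPoIB address (QPN + GID) in network order... */
        ibd_h2n_mac(&mac, IB_MC_QPN, sgid.gid_prefix, sgid.gid_guid);
        /* ...and recover the GID in host order */
        ibd_n2h_gid(&mac, &gid);
        ASSERT(gid.gid_prefix == sgid.gid_prefix &&
            gid.gid_guid == sgid.gid_guid);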
1083 1083  
1084 1084  /*
1085 1085   * Send to the appropriate all-routers group when the IBA multicast group
1086 1086   * does not exist, based on whether the target group is v4 or v6.
1087 1087   */
1088 1088  static boolean_t
1089 1089  ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
1090 1090      ipoib_mac_t *rmac)
1091 1091  {
1092 1092          boolean_t retval = B_TRUE;
1093 1093          uint32_t adjscope = state->id_scope << 16;
1094 1094          uint32_t topword;
1095 1095  
1096 1096          /*
1097 1097           * Copy the first 4 bytes in without assuming any alignment of
1098 1098           * input mac address; this will have IPoIB signature, flags and
1099 1099           * scope bits.
1100 1100           */
1101 1101          bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
1102 1102          topword = ntohl(topword);
1103 1103  
1104 1104          /*
1105 1105           * Generate proper address for IPv4/v6, adding in the Pkey properly.
1106 1106           */
1107 1107          if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
1108 1108              (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
1109 1109                  ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
1110 1110                      ((uint32_t)(state->id_pkey << 16))),
1111 1111                      (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
1112 1112          else
1113 1113                  /*
1114 1114                   * Does not have proper bits in the mgid address.
1115 1115                   */
1116 1116                  retval = B_FALSE;
1117 1117  
1118 1118          return (retval);
1119 1119  }
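
(The GUID portion passed to ibd_h2n_mac() above is INADDR_ALLRTRS_GROUP -
INADDR_UNSPEC_GROUP = 0xE0000002 - 0xE0000000 = 2, i.e. the ::2 suffix of
the all-routers MGID, while the prefix keeps the caller's scope word and
gains the PKEY, shifted into bits 16-31 of its low word.)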
1120 1120  
1121 1121  /*
1122 1122   * Membership states for different mcg's are tracked by two lists:
1123 1123   * the "non" list is used for promiscuous mode, when all mcg traffic
1124 1124   * needs to be inspected. This type of membership is never used for
1125 1125   * transmission, so there can not be an AH in the active list
1126 1126   * corresponding to a member in this list. This list does not need
1127 1127   * any protection, since all operations are performed by the async
1128 1128   * thread.
1129 1129   *
1130 1130   * "Full" and "SendOnly" membership is tracked using a single list,
1131 1131   * the "full" list. This is because this single list can then be
1132 1132   * searched during transmit to a multicast group (if an AH for the
1133 1133   * mcg is not found in the active list), since at least one type
1134 1134   * of membership must be present before initiating the transmit.
1135 1135   * This list is also emptied during driver detach, since sendonly
1136 1136   * membership acquired during transmit is dropped at detach time
1137 1137   * along with ipv4 broadcast full membership. Insert/deletes to
1138 1138   * this list are done only by the async thread, but it is also
1139 1139   * searched in program context (see multicast disable case), thus
1140 1140   * the id_mc_mutex protects the list. The driver detach path also
1141 1141   * deconstructs the "full" list, but it ensures that the async
1142 1142   * thread will not be accessing the list (by blocking out mcg
1143 1143   * trap handling and making sure no more Tx reaping will happen).
1144 1144   *
1145 1145   * Currently, an IBA attach is done in the SendOnly case too,
1146 1146   * although this is not required.
1147 1147   */
1148 1148  #define IBD_MCACHE_INSERT_FULL(state, mce) \
1149 1149          list_insert_head(&state->id_mc_full, mce)
1150 1150  #define IBD_MCACHE_INSERT_NON(state, mce) \
1151 1151          list_insert_head(&state->id_mc_non, mce)
1152 1152  #define IBD_MCACHE_FIND_FULL(state, mgid) \
1153 1153          ibd_mcache_find(mgid, &state->id_mc_full)
1154 1154  #define IBD_MCACHE_FIND_NON(state, mgid) \
1155 1155          ibd_mcache_find(mgid, &state->id_mc_non)
1156 1156  #define IBD_MCACHE_PULLOUT_FULL(state, mce) \
1157 1157          list_remove(&state->id_mc_full, mce)
1158 1158  #define IBD_MCACHE_PULLOUT_NON(state, mce) \
1159 1159          list_remove(&state->id_mc_non, mce)
1160 1160  
1161 1161  static void *
1162 1162  list_get_head(list_t *list)
1163 1163  {
1164 1164          list_node_t *lhead = list_head(list);
1165 1165  
1166 1166          if (lhead != NULL)
1167 1167                  list_remove(list, lhead);
1168 1168          return (lhead);
1169 1169  }
1170 1170  
1171 1171  /*
1172 1172   * This is always guaranteed to be able to queue the work.
1173 1173   */
1174 1174  void
1175 1175  ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1176 1176  {
1177 1177          /* Initialize request */
1178 1178          DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1179 1179          ptr->rq_op = op;
1180 1180  
1181 1181          /*
1182 1182           * Queue provided slot onto request pool.
1183 1183           */
1184 1184          mutex_enter(&state->id_acache_req_lock);
1185 1185          list_insert_tail(&state->id_req_list, ptr);
1186 1186  
1187 1187          /* Go, fetch, async thread */
1188 1188          cv_signal(&state->id_acache_req_cv);
1189 1189          mutex_exit(&state->id_acache_req_lock);
1190 1190  }
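
A hedged sketch of the producer side; the allocation from id_req_kmc mirrors
the kmem_cache_free() performed by ibd_async_work() below:

        ibd_req_t *req;

        req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
        if (req != NULL) {
                /* fill in rq_mac/rq_gid/rq_ptr as the op requires */
                ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
        }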
1191 1191  
1192 1192  /*
1193 1193   * Main body of the per interface async thread.
1194 1194   */
1195 1195  static void
1196 1196  ibd_async_work(ibd_state_t *state)
1197 1197  {
1198 1198          ibd_req_t *ptr;
1199 1199          callb_cpr_t cprinfo;
1200 1200  
1201 1201          mutex_enter(&state->id_acache_req_lock);
1202 1202          CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1203 1203              callb_generic_cpr, "ibd_async_work");
1204 1204  
1205 1205          for (;;) {
1206 1206                  ptr = list_get_head(&state->id_req_list);
1207 1207                  if (ptr != NULL) {
1208 1208                          mutex_exit(&state->id_acache_req_lock);
1209 1209  
1210 1210                          /*
1211 1211                           * If we are in late hca initialization mode, do not
1212 1212                           * process any async request other than TRAP. TRAP
1213 1213                           * is used for indicating creation of a broadcast group,
1214 1214                           * in which case we need to join/create the group.
1215 1215                           */
1216 1216                          if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
1217 1217                              (ptr->rq_op != IBD_ASYNC_TRAP)) {
1218 1218                                  goto free_req_and_continue;
1219 1219                          }
1220 1220  
1221 1221                          /*
1222 1222                           * Once we have done the operation, there is no
1223 1223                           * guarantee the request slot is going to be valid;
1224 1224                           * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1225 1225                           * TRAP).
1226 1226                           *
1227 1227                           * Perform the request.
1228 1228                           */
1229 1229                          switch (ptr->rq_op) {
1230 1230                                  case IBD_ASYNC_GETAH:
1231 1231                                          ibd_async_acache(state, &ptr->rq_mac);
1232 1232                                          break;
1233 1233                                  case IBD_ASYNC_JOIN:
1234 1234                                  case IBD_ASYNC_LEAVE:
1235 1235                                          ibd_async_multicast(state,
1236 1236                                              ptr->rq_gid, ptr->rq_op);
1237 1237                                          break;
1238 1238                                  case IBD_ASYNC_PROMON:
1239 1239                                          ibd_async_setprom(state);
1240 1240                                          break;
1241 1241                                  case IBD_ASYNC_PROMOFF:
1242 1242                                          ibd_async_unsetprom(state);
1243 1243                                          break;
1244 1244                                  case IBD_ASYNC_REAP:
1245 1245                                          ibd_async_reap_group(state,
1246 1246                                              ptr->rq_ptr, ptr->rq_gid,
1247 1247                                              IB_MC_JSTATE_FULL);
1248 1248                                          /*
1249 1249                                           * the req buf is contained in the
1250 1250                                           * mce structure, so we do not need
1251 1251                                           * to free it here.
1252 1252                                           */
1253 1253                                          ptr = NULL;
1254 1254                                          break;
1255 1255                                  case IBD_ASYNC_TRAP:
1256 1256                                          ibd_async_trap(state, ptr);
1257 1257                                          break;
1258 1258                                  case IBD_ASYNC_SCHED:
1259 1259                                          ibd_async_txsched(state);
1260 1260                                          break;
1261 1261                                  case IBD_ASYNC_LINK:
1262 1262                                          ibd_async_link(state, ptr);
1263 1263                                          break;
1264 1264                                  case IBD_ASYNC_EXIT:
1265 1265                                          mutex_enter(&state->id_acache_req_lock);
1266 1266  #ifndef __lock_lint
1267 1267                                          CALLB_CPR_EXIT(&cprinfo);
1268 1268  #else
1269 1269                                          mutex_exit(&state->id_acache_req_lock);
1270 1270  #endif
1271 1271                                          return;
1272 1272                                  case IBD_ASYNC_RC_TOO_BIG:
1273 1273                                          ibd_async_rc_process_too_big(state,
1274 1274                                              ptr);
1275 1275                                          break;
1276 1276                                  case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
1277 1277                                          ibd_async_rc_close_act_chan(state, ptr);
1278 1278                                          break;
1279 1279                                  case IBD_ASYNC_RC_RECYCLE_ACE:
1280 1280                                          ibd_async_rc_recycle_ace(state, ptr);
1281 1281                                          break;
1282 1282                                  case IBD_ASYNC_RC_CLOSE_PAS_CHAN:
1283 1283                                          (void) ibd_rc_pas_close(ptr->rq_ptr,
1284 1284                                              B_TRUE, B_TRUE);
1285 1285                                          break;
1286 1286                          }
1287 1287  free_req_and_continue:
1288 1288                          if (ptr != NULL)
1289 1289                                  kmem_cache_free(state->id_req_kmc, ptr);
1290 1290  
1291 1291                          mutex_enter(&state->id_acache_req_lock);
1292 1292                  } else {
1293 1293  #ifndef __lock_lint
1294 1294                          /*
1295 1295                           * Nothing to do: wait till new request arrives.
1296 1296                           */
1297 1297                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
1298 1298                          cv_wait(&state->id_acache_req_cv,
1299 1299                              &state->id_acache_req_lock);
1300 1300                          CALLB_CPR_SAFE_END(&cprinfo,
1301 1301                              &state->id_acache_req_lock);
1302 1302  #endif
1303 1303                  }
1304 1304          }
1305 1305  
1306 1306          /*NOTREACHED*/
1307 1307          _NOTE(NOT_REACHED)
1308 1308  }
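
/*
 * Reviewer sketch (illustrative only, not part of this change): the
 * producer half that pairs with the cv_wait() in the daemon loop above.
 * ibd_queue_work_slot() is the real producer in this file; the request
 * list field name below is an assumption made for the sketch.
 */
static void
ibd_queue_work_sketch(ibd_state_t *state, ibd_req_t *req, int op)
{
        req->rq_op = op;                        /* record the async opcode */
        mutex_enter(&state->id_acache_req_lock);
        /* hypothetical list field; the driver's actual name may differ */
        list_insert_tail(&state->id_req_list, req);
        cv_signal(&state->id_acache_req_cv);    /* wake the daemon */
        mutex_exit(&state->id_acache_req_lock);
}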
1309 1309  
1310 1310  /*
1311 1311   * Return B_TRUE when it is safe to queue requests to the async daemon;
1312 1312   * primarily for subnet trap and async event handling. Disallow requests
1313 1313   * before the daemon is created, and when interface deinitialization starts.
1314 1314   */
1315 1315  static boolean_t
1316 1316  ibd_async_safe(ibd_state_t *state)
1317 1317  {
1318 1318          mutex_enter(&state->id_trap_lock);
1319 1319          if (state->id_trap_stop) {
1320 1320                  mutex_exit(&state->id_trap_lock);
1321 1321                  return (B_FALSE);
1322 1322          }
1323 1323          state->id_trap_inprog++;
1324 1324          mutex_exit(&state->id_trap_lock);
1325 1325          return (B_TRUE);
1326 1326  }
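
/*
 * Reviewer sketch (illustrative only, not part of this change): handlers
 * that feed the async daemon bracket their work with ibd_async_safe() and
 * ibd_async_done(), so ibd_m_stop() can wait on id_trap_cv for in-flight
 * work to drain. Simplified to a synchronous body for clarity.
 */
static void
ibd_example_trap_work(ibd_state_t *state)
{
        if (!ibd_async_safe(state))
                return;         /* daemon gone, or deinit in progress */
        /* ... perform or queue the trap/event work here ... */
        ibd_async_done(state);  /* signals id_trap_cv when count hits 0 */
}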
1327 1327  
1328 1328  /*
1329 1329   * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1330 1330   * trap or event handling to complete to kill the async thread and deconstruct
1331 1331   * the mcg/ace list.
1332 1332   */
1333 1333  static void
1334 1334  ibd_async_done(ibd_state_t *state)
1335 1335  {
1336 1336          mutex_enter(&state->id_trap_lock);
1337 1337          if (--state->id_trap_inprog == 0)
1338 1338                  cv_signal(&state->id_trap_cv);
1339 1339          mutex_exit(&state->id_trap_lock);
1340 1340  }
1341 1341  
1342 1342  /*
1343 1343   * Hash functions:
1344 1344   * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1345 1345   * ibd_hash_key_cmp: Compares two keys; returns 0 if they match, else 1.
1346 1346   * These operate on mac addresses input into ibd_send, but there is no
1347 1347   * guarantee on the alignment of the ipoib_mac_t structure.
1348 1348   */
1349 1349  /*ARGSUSED*/
1350 1350  static uint_t
1351 1351  ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1352 1352  {
1353 1353          ulong_t ptraddr = (ulong_t)key;
1354 1354          uint_t hval;
1355 1355  
1356 1356          /*
1357 1357           * If the input address is 4 byte aligned, we can just dereference
1358 1358           * it. This is most common, since IP will send in a 4 byte aligned
1359 1359   * IP header, which implies the 24 byte IPoIB pseudo header will be
1360 1360           * 4 byte aligned too.
1361 1361           */
1362 1362          if ((ptraddr & 3) == 0)
1363 1363                  return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1364 1364  
1365 1365          bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1366 1366          return (hval);
1367 1367  }
1368 1368  
1369 1369  static int
1370 1370  ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1371 1371  {
1372 1372          if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1373 1373                  return (0);
1374 1374          else
1375 1375                  return (1);
1376 1376  }
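
/*
 * Stand-alone illustration (not part of this change) of the alignment-safe
 * read idiom used by ibd_hash_by_id() above: dereference when the pointer
 * is 4-byte aligned, otherwise fall back to a byte copy. This user-space
 * analog compiles on its own.
 */
#include <stdint.h>
#include <string.h>

static uint32_t
read_u32_any_align(const void *p)
{
        uint32_t v;

        if (((uintptr_t)p & 3) == 0)
                return (*(const uint32_t *)p);  /* aligned: direct load */
        (void) memcpy(&v, p, sizeof (v));       /* unaligned: safe copy */
        return (v);
}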
1377 1377  
1378 1378  /*
1379 1379   * Initialize all the per interface caches and lists; AH cache,
1380 1380   * MCG list etc.
1381 1381   */
1382 1382  static int
1383 1383  ibd_acache_init(ibd_state_t *state)
1384 1384  {
1385 1385          ibd_ace_t *ce;
1386 1386          int i;
1387 1387  
1388 1388          mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1389 1389          mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1390 1390          mutex_enter(&state->id_ac_mutex);
1391 1391          list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1392 1392              offsetof(ibd_ace_t, ac_list));
1393 1393          list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1394 1394              offsetof(ibd_ace_t, ac_list));
1395 1395          state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1396 1396              state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1397 1397              ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1398 1398          list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1399 1399              offsetof(ibd_mce_t, mc_list));
1400 1400          list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1401 1401              offsetof(ibd_mce_t, mc_list));
1402 1402          state->id_ac_hot_ace = NULL;
1403 1403  
1404 1404          state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1405 1405              state->id_num_ah, KM_SLEEP);
1406 1406          for (i = 0; i < state->id_num_ah; i++, ce++) {
1407 1407                  if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1408 1408                      state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1409 1409                          mutex_exit(&state->id_ac_mutex);
1410 1410                          ibd_acache_fini(state);
1411 1411                          return (DDI_FAILURE);
1412 1412                  } else {
1413 1413                          CLEAR_REFCYCLE(ce);
1414 1414                          ce->ac_mce = NULL;
1415 1415                          mutex_init(&ce->tx_too_big_mutex, NULL,
1416 1416                              MUTEX_DRIVER, NULL);
1417 1417                          IBD_ACACHE_INSERT_FREE(state, ce);
1418 1418                  }
1419 1419          }
1420 1420          mutex_exit(&state->id_ac_mutex);
1421 1421          return (DDI_SUCCESS);
1422 1422  }
1423 1423  
1424 1424  static void
1425 1425  ibd_acache_fini(ibd_state_t *state)
1426 1426  {
1427 1427          ibd_ace_t *ptr;
1428 1428  
1429 1429          mutex_enter(&state->id_ac_mutex);
1430 1430  
1431 1431          while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1432 1432                  ASSERT(GET_REF(ptr) == 0);
1433 1433                  mutex_destroy(&ptr->tx_too_big_mutex);
1434 1434                  (void) ibt_free_ud_dest(ptr->ac_dest);
1435 1435          }
1436 1436  
1437 1437          while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1438 1438                  ASSERT(GET_REF(ptr) == 0);
1439 1439                  mutex_destroy(&ptr->tx_too_big_mutex);
1440 1440                  (void) ibt_free_ud_dest(ptr->ac_dest);
1441 1441          }
1442 1442  
1443 1443          list_destroy(&state->id_ah_free);
1444 1444          list_destroy(&state->id_ah_active);
1445 1445          list_destroy(&state->id_mc_full);
1446 1446          list_destroy(&state->id_mc_non);
1447 1447          kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1448 1448          mutex_exit(&state->id_ac_mutex);
1449 1449          mutex_destroy(&state->id_ac_mutex);
1450 1450          mutex_destroy(&state->id_mc_mutex);
1451 1451  }
1452 1452  
1453 1453  /*
1454 1454   * Search AH active hash list for a cached path to input destination.
1455 1455   * If we are "just looking", hold == F. When we are in the Tx path,
1456 1456   * we set hold == T to grab a reference on the AH so that it cannot
1457 1457   * be recycled to a new destination while the Tx request is posted.
1458 1458   */
1459 1459  ibd_ace_t *
1460 1460  ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1461 1461  {
1462 1462          ibd_ace_t *ptr;
1463 1463  
1464 1464          ASSERT(mutex_owned(&state->id_ac_mutex));
1465 1465  
1466 1466          /*
1467 1467           * Do hash search.
1468 1468           */
1469 1469          if (mod_hash_find(state->id_ah_active_hash,
1470 1470              (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1471 1471                  if (hold)
1472 1472                          INC_REF(ptr, num);
1473 1473                  return (ptr);
1474 1474          }
1475 1475          return (NULL);
1476 1476  }
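
/*
 * Illustrative Tx-side usage (not part of this change): look up with
 * hold == B_TRUE so the AH cannot be recycled while the send is posted,
 * then drop the reference on Tx completion. DEC_REF is assumed to be
 * the counterpart of INC_REF.
 */
static void
ibd_example_tx_hold(ibd_state_t *state, ipoib_mac_t *mac)
{
        ibd_ace_t *ace;

        mutex_enter(&state->id_ac_mutex);
        ace = ibd_acache_find(state, mac, B_TRUE, 1);   /* one pending WQE */
        mutex_exit(&state->id_ac_mutex);
        if (ace != NULL) {
                /* ... post the send using ace->ac_dest ... */
                /* Tx completion would then do: DEC_REF(ace, 1); */
        }
}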
1477 1477  
1478 1478  /*
1479 1479   * This is called by the tx side; if an initialized AH is found in
1480 1480   * the active list, it is locked down and can be used; if no entry
1481 1481   * is found, an async request is queued to do path resolution.
1482 1482   */
1483 1483  static ibd_ace_t *
1484 1484  ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1485 1485  {
1486 1486          ibd_ace_t *ptr;
1487 1487          ibd_req_t *req;
1488 1488  
1489 1489          /*
1490 1490           * Only attempt to print when we can; in the mdt pattr case, the
1491 1491           * address is not aligned properly.
1492 1492           */
1493 1493          if (((ulong_t)mac & 3) == 0) {
1494 1494                  DPRINT(4,
1495 1495                      "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1496 1496                      htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1497 1497                      htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1498 1498                      htonl(mac->ipoib_gidsuff[1]));
1499 1499          }
1500 1500  
1501 1501          mutex_enter(&state->id_ac_mutex);
1502 1502  
1503 1503          if (((ptr = state->id_ac_hot_ace) != NULL) &&
1504 1504              (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1505 1505                  INC_REF(ptr, numwqe);
1506 1506                  mutex_exit(&state->id_ac_mutex);
1507 1507                  return (ptr);
1508 1508          }
1509 1509          if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1510 1510                  state->id_ac_hot_ace = ptr;
1511 1511                  mutex_exit(&state->id_ac_mutex);
1512 1512                  return (ptr);
1513 1513          }
1514 1514  
1515 1515          /*
1516 1516           * Implementation of a single outstanding async request; if
1517 1517           * the operation is not started yet, queue a request and move
1518 1518           * to ongoing state. Remember in id_ah_addr for which address
1519 1519   * we are queueing the request, in case we need to flag an error.
1520 1520   * Any further requests, for the same or different address, until
1521 1521   * the operation completes, are sent back to GLDv3 to be retried.
1522 1522           * The async thread will update id_ah_op with an error indication
1523 1523           * or will set it to indicate the next look up can start; either
1524 1524           * way, it will mac_tx_update() so that all blocked requests come
1525 1525           * back here.
1526 1526           */
1527 1527          *err = EAGAIN;
1528 1528          if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1529 1529                  req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1530 1530                  if (req != NULL) {
1531 1531                          /*
1532 1532                           * We did not even find the entry; queue a request
1533 1533                           * for it.
1534 1534                           */
1535 1535                          bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1536 1536                          state->id_ah_op = IBD_OP_ONGOING;
1537 1537                          ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1538 1538                          bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1539 1539                  }
1540 1540          } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1541 1541              (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1542 1542                  /*
1543 1543                   * Check the status of the pathrecord lookup request
1544 1544                   * we had queued before.
1545 1545                   */
1546 1546                  if (state->id_ah_op == IBD_OP_ERRORED) {
1547 1547                          *err = EFAULT;
1548 1548                          state->id_ah_error++;
1549 1549                  } else {
1550 1550                          /*
1551 1551                           * IBD_OP_ROUTERED case: We need to send to the
1552 1552                           * all-router MCG. If we can find the AH for
1553 1553                           * the mcg, the Tx will be attempted. If we
1554 1554                           * do not find the AH, we return NORESOURCES
1555 1555                           * to retry.
1556 1556                           */
1557 1557                          ipoib_mac_t routermac;
1558 1558  
1559 1559                          (void) ibd_get_allroutergroup(state, mac, &routermac);
1560 1560                          ptr = ibd_acache_find(state, &routermac, B_TRUE,
1561 1561                              numwqe);
1562 1562                  }
1563 1563                  state->id_ah_op = IBD_OP_NOTSTARTED;
1564 1564          } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1565 1565              (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1566 1566                  /*
1567 1567                   * This case can happen when we get a higher band
1568 1568                   * packet. The easiest way is to reset the state machine
1569 1569                   * to accommodate the higher priority packet.
1570 1570                   */
1571 1571                  state->id_ah_op = IBD_OP_NOTSTARTED;
1572 1572          }
1573 1573          mutex_exit(&state->id_ac_mutex);
1574 1574  
1575 1575          return (ptr);
1576 1576  }
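
/*
 * Reviewer summary (informational, not part of this change) of the
 * id_ah_op states driven by the lookup above and ibd_async_acache():
 *
 *   NOTSTARTED --queue IBD_ASYNC_GETAH--> ONGOING
 *   ONGOING    --path resolved----------> NOTSTARTED (AH now cached)
 *   ONGOING    --resolution failed------> ERRORED    (lookup: EFAULT)
 *   ONGOING    --redirected to router---> ROUTERED   (lookup: try the
 *                                                     all-router MCG AH)
 *   ERRORED/ROUTERED, or a new address --> NOTSTARTED (reset, retry)
 */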
1577 1577  
1578 1578  /*
1579 1579   * Grab a not-currently-in-use AH/PathRecord from the active
1580 1580   * list to recycle to a new destination. Only the async thread
1581 1581   * executes this code.
1582 1582   */
1583 1583  static ibd_ace_t *
1584 1584  ibd_acache_get_unref(ibd_state_t *state)
1585 1585  {
1586 1586          ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1587 1587          boolean_t try_rc_chan_recycle = B_FALSE;
1588 1588  
1589 1589          ASSERT(mutex_owned(&state->id_ac_mutex));
1590 1590  
1591 1591          /*
1592 1592           * Do plain linear search.
1593 1593           */
1594 1594          while (ptr != NULL) {
1595 1595                  /*
1596 1596                   * Note that it is possible that the "cycle" bit
1597 1597                   * is set on the AH w/o any reference count. The
1598 1598                   * mcg must have been deleted, and the tx cleanup
1599 1599                   * just decremented the reference count to 0, but
1600 1600                   * hasn't gotten around to grabbing the id_ac_mutex
1601 1601                   * to move the AH into the free list.
1602 1602                   */
1603 1603                  if (GET_REF(ptr) == 0) {
1604 1604                          if (ptr->ac_chan != NULL) {
1605 1605                                  ASSERT(state->id_enable_rc == B_TRUE);
1606 1606                                  if (!try_rc_chan_recycle) {
1607 1607                                          try_rc_chan_recycle = B_TRUE;
1608 1608                                          ibd_rc_signal_ace_recycle(state, ptr);
1609 1609                                  }
1610 1610                          } else {
1611 1611                                  IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1612 1612                                  break;
1613 1613                          }
1614 1614                  }
1615 1615                  ptr = list_prev(&state->id_ah_active, ptr);
1616 1616          }
1617 1617          return (ptr);
1618 1618  }
1619 1619  
1620 1620  /*
1621 1621   * Invoked to clean up AH from active list in case of multicast
1622 1622   * disable and to handle sendonly memberships during mcg traps.
1623 1623   * And for port up processing for multicast and unicast AHs.
1624 1624   * Normally, the AH is taken off the active list, and put into
1625 1625   * the free list to be recycled for a new destination. In case
1626 1626   * Tx requests on the AH have not completed yet, the AH is marked
1627 1627   * for reaping (which will put the AH on the free list) once the Tx's
1628 1628   * complete; in this case, depending on the "force" input, we take
1629 1629   * out the AH from the active list right now, or leave it also for
1630 1630   * the reap operation. Returns TRUE if the AH is taken off the active
1631 1631   * list (and either put into the free list right now, or arranged for
1632 1632   * later), FALSE otherwise.
1633 1633   */
1634 1634  boolean_t
1635 1635  ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1636 1636  {
1637 1637          ibd_ace_t *acactive;
1638 1638          boolean_t ret = B_TRUE;
1639 1639  
1640 1640          ASSERT(mutex_owned(&state->id_ac_mutex));
1641 1641  
1642 1642          if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1643 1643  
1644 1644                  /*
1645 1645                   * Note that the AH might already have the cycle bit set
1646 1646                   * on it; this might happen if sequences of multicast
1647 1647                   * enables and disables are coming so fast, that posted
1648 1648                   * Tx's to the mcg have not completed yet, and the cycle
1649 1649                   * bit is set successively by each multicast disable.
1650 1650                   */
1651 1651                  if (SET_CYCLE_IF_REF(acactive)) {
1652 1652                          if (!force) {
1653 1653                                  /*
1654 1654                                   * The ace is kept on the active list, further
1655 1655                                   * Tx's can still grab a reference on it; the
1656 1656                                   * ace is reaped when all pending Tx's
1657 1657                                   * referencing the AH complete.
1658 1658                                   */
1659 1659                                  ret = B_FALSE;
1660 1660                          } else {
1661 1661                                  /*
1662 1662                                   * In the mcg trap case, we always pull the
1663 1663                                   * AH from the active list. And also the port
1664 1664                                   * up multi/unicast case.
1665 1665                                   */
1666 1666                                  ASSERT(acactive->ac_chan == NULL);
1667 1667                                  IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1668 1668                                  acactive->ac_mce = NULL;
1669 1669                          }
1670 1670                  } else {
1671 1671                          /*
1672 1672                           * Determined the ref count is 0, thus reclaim
1673 1673                           * immediately after pulling out the ace from
1674 1674                           * the active list.
1675 1675                           */
1676 1676                          ASSERT(acactive->ac_chan == NULL);
1677 1677                          IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1678 1678                          acactive->ac_mce = NULL;
1679 1679                          IBD_ACACHE_INSERT_FREE(state, acactive);
1680 1680                  }
1681 1681  
1682 1682          }
1683 1683          return (ret);
1684 1684  }
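
/*
 * Illustrative caller (not part of this change): the multicast-disable
 * path, where an AH still referenced by posted Tx's is left on the
 * active list and reaped when those Tx's complete. "mcmac" is a
 * hypothetical multicast destination used only for this sketch.
 */
static void
ibd_example_mc_disable(ibd_state_t *state, ipoib_mac_t *mcmac)
{
        mutex_enter(&state->id_ac_mutex);
        if (!ibd_acache_recycle(state, mcmac, B_FALSE)) {
                /* Tx's still hold references; reap happens on completion. */
        }
        mutex_exit(&state->id_ac_mutex);
}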
1685 1685  
1686 1686  /*
1687 1687   * Helper function for async path record lookup. If we are trying to
1688 1688   * Tx to a MCG, check our membership, possibly trying to join the
1689 1689   * group if required. If that fails, try to send the packet to the
1690 1690   * all router group (indicated by the redirect output), pointing
1691 1691   * the input mac address to the router mcg address.
1692 1692   */
1693 1693  static ibd_mce_t *
1694 1694  ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1695 1695  {
1696 1696          ib_gid_t mgid;
1697 1697          ibd_mce_t *mce;
1698 1698          ipoib_mac_t routermac;
1699 1699  
1700 1700          *redirect = B_FALSE;
1701 1701          ibd_n2h_gid(mac, &mgid);
1702 1702  
1703 1703          /*
1704 1704           * Check the FullMember+SendOnlyNonMember list.
1705 1705           * Since we are the only one who manipulates the
1706 1706           * id_mc_full list, no locks are needed.
1707 1707           */
1708 1708          mce = IBD_MCACHE_FIND_FULL(state, mgid);
1709 1709          if (mce != NULL) {
1710 1710                  DPRINT(4, "ibd_async_mcache : already joined to group");
1711 1711                  return (mce);
1712 1712          }
1713 1713  
1714 1714          /*
1715 1715           * Not found; try to join(SendOnlyNonMember) and attach.
1716 1716           */
1717 1717          DPRINT(4, "ibd_async_mcache : not joined to group");
1718 1718          if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1719 1719              NULL) {
1720 1720                  DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1721 1721                  return (mce);
1722 1722          }
1723 1723  
1724 1724          /*
1725 1725           * MCGroup not present; try to join the all-router group. If
1726 1726           * any of the following steps succeed, we will be redirecting
1727 1727           * to the all router group.
1728 1728           */
1729 1729          DPRINT(4, "ibd_async_mcache : nonmem join failed");
1730 1730          if (!ibd_get_allroutergroup(state, mac, &routermac))
1731 1731                  return (NULL);
1732 1732          *redirect = B_TRUE;
1733 1733          ibd_n2h_gid(&routermac, &mgid);
1734 1734          bcopy(&routermac, mac, IPOIB_ADDRL);
1735 1735          DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1736 1736              mgid.gid_prefix, mgid.gid_guid);
1737 1737  
1738 1738          /*
1739 1739           * Are we already joined to the router group?
1740 1740           */
1741 1741          if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1742 1742                  DPRINT(4, "ibd_async_mcache : using already joined "
1743 1743                      "router group\n");
1744 1744                  return (mce);
1745 1745          }
1746 1746  
1747 1747          /*
1748 1748           * Can we join(SendOnlyNonMember) the router group?
1749 1749           */
1750 1750          DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1751 1751          if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1752 1752              NULL) {
1753 1753                  DPRINT(4, "ibd_async_mcache : joined to router grp");
1754 1754                  return (mce);
1755 1755          }
1756 1756  
1757 1757          return (NULL);
1758 1758  }
1759 1759  
1760 1760  /*
1761 1761   * Async path record lookup code.
1762 1762   */
1763 1763  static void
1764 1764  ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1765 1765  {
1766 1766          ibd_ace_t *ce;
1767 1767          ibd_mce_t *mce = NULL;
1768 1768          ibt_path_attr_t path_attr;
1769 1769          ibt_path_info_t path_info;
1770 1770          ib_gid_t destgid;
1771 1771          char ret = IBD_OP_NOTSTARTED;
1772 1772  
1773 1773          DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1774 1774              htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1775 1775              htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1776 1776              htonl(mac->ipoib_gidsuff[1]));
1777 1777  
1778 1778          /*
1779 1779           * Check whether we are trying to transmit to a MCG.
1780 1780           * In that case, we need to make sure we are a member of
1781 1781           * the MCG.
1782 1782           */
1783 1783          if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1784 1784                  boolean_t redirected;
1785 1785  
1786 1786                  /*
1787 1787                   * If we can not find or join the group or even
1788 1788                   * redirect, error out.
1789 1789                   */
1790 1790                  if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1791 1791                      NULL) {
1792 1792                          state->id_ah_op = IBD_OP_ERRORED;
1793 1793                          return;
1794 1794                  }
1795 1795  
1796 1796                  /*
1797 1797                   * If we got redirected, we need to determine whether
1798 1798                   * the AH for the new mcg is in the cache already, and
1799 1799                   * not pull it in then; otherwise proceed to get the
1800 1800                   * path for the new mcg. There is no guarantee that
1801 1801                   * if the AH is currently in the cache, it will still be
1802 1802                   * there when we look in ibd_acache_lookup(), but that's
1803 1803                   * okay, we will come back here.
1804 1804                   */
1805 1805                  if (redirected) {
1806 1806                          ret = IBD_OP_ROUTERED;
1807 1807                          DPRINT(4, "ibd_async_acache :  redirected to "
1808 1808                              "%08X:%08X:%08X:%08X:%08X",
1809 1809                              htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1810 1810                              htonl(mac->ipoib_gidpref[1]),
1811 1811                              htonl(mac->ipoib_gidsuff[0]),
1812 1812                              htonl(mac->ipoib_gidsuff[1]));
1813 1813  
1814 1814                          mutex_enter(&state->id_ac_mutex);
1815 1815                          if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1816 1816                                  state->id_ah_op = IBD_OP_ROUTERED;
1817 1817                                  mutex_exit(&state->id_ac_mutex);
1818 1818                                  DPRINT(4, "ibd_async_acache : router AH found");
1819 1819                                  return;
1820 1820                          }
1821 1821                          mutex_exit(&state->id_ac_mutex);
1822 1822                  }
1823 1823          }
1824 1824  
1825 1825          /*
1826 1826           * Get an AH from the free list.
1827 1827           */
1828 1828          mutex_enter(&state->id_ac_mutex);
1829 1829          if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1830 1830                  /*
1831 1831                   * No free ones; try to grab an unreferenced active
1832 1832                   * one. Maybe we need to make the active list LRU,
1833 1833                   * but that will create more work for Tx callbacks.
1834 1834                   * Is there a way of not having to pull out the
1835 1835                   * entry from the active list, but just indicate it
1836 1836                   * is being recycled? Yes, but that creates one more
1837 1837                   * check in the fast lookup path.
1838 1838                   */
1839 1839                  if ((ce = ibd_acache_get_unref(state)) == NULL) {
1840 1840                          /*
1841 1841                           * Pretty serious shortage now.
1842 1842                           */
1843 1843                          state->id_ah_op = IBD_OP_NOTSTARTED;
1844 1844                          mutex_exit(&state->id_ac_mutex);
1845 1845                          DPRINT(10, "ibd_async_acache : failed to find AH "
1846 1846                              "slot\n");
1847 1847                          return;
1848 1848                  }
1849 1849                  /*
1850 1850                   * We could check whether ac_mce points to a SendOnly
1851 1851                   * member and drop that membership now. Or do it lazily
1852 1852                   * at detach time.
1853 1853                   */
1854 1854                  ce->ac_mce = NULL;
1855 1855          }
1856 1856          mutex_exit(&state->id_ac_mutex);
1857 1857          ASSERT(ce->ac_mce == NULL);
1858 1858  
1859 1859          /*
1860 1860           * Update the entry.
1861 1861           */
1862 1862          bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1863 1863  
1864 1864          bzero(&path_info, sizeof (path_info));
1865 1865          bzero(&path_attr, sizeof (ibt_path_attr_t));
1866 1866          path_attr.pa_sgid = state->id_sgid;
1867 1867          path_attr.pa_num_dgids = 1;
1868 1868          ibd_n2h_gid(&ce->ac_mac, &destgid);
1869 1869          path_attr.pa_dgids = &destgid;
1870 1870          path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1871 1871          path_attr.pa_pkey = state->id_pkey;
1872 1872          if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1873 1873              &path_info, NULL) != IBT_SUCCESS) {
1874 1874                  DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1875 1875                  goto error;
1876 1876          }
1877 1877          if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1878 1878              ntohl(ce->ac_mac.ipoib_qpn),
1879 1879              &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1880 1880                  DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1881 1881                  goto error;
1882 1882          }
1883 1883  
1884 1884          /*
1885 1885           * mce is set whenever an AH is being associated with a
1886 1886           * MCG; this will come in handy when we leave the MCG. The
1887 1887           * lock protects Tx fastpath from scanning the active list.
1888 1888           */
1889 1889          if (mce != NULL)
1890 1890                  ce->ac_mce = mce;
1891 1891  
1892 1892          /*
1893 1893           * Initiate an RC mode connection for a unicast address.
1894 1894           */
1895 1895          if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1896 1896              (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1897 1897                  ASSERT(ce->ac_chan == NULL);
1898 1898                  DPRINT(10, "ibd_async_acache: call "
1899 1899                      "ibd_rc_try_connect(ace=%p)", ce);
1900 1900                  ibd_rc_try_connect(state, ce, &path_info);
1901 1901                  if (ce->ac_chan == NULL) {
1902 1902                          DPRINT(10, "ibd_async_acache: fail to setup RC"
1903 1903                              " channel");
1904 1904                          state->rc_conn_fail++;
1905 1905                          goto error;
1906 1906                  }
1907 1907          }
1908 1908  
1909 1909          mutex_enter(&state->id_ac_mutex);
1910 1910          IBD_ACACHE_INSERT_ACTIVE(state, ce);
1911 1911          state->id_ah_op = ret;
1912 1912          mutex_exit(&state->id_ac_mutex);
1913 1913          return;
1914 1914  error:
1915 1915          /*
1916 1916           * We might want to drop SendOnly membership here if we
1917 1917           * joined above. The lock protects Tx callbacks inserting
1918 1918           * into the free list.
1919 1919           */
1920 1920          mutex_enter(&state->id_ac_mutex);
1921 1921          state->id_ah_op = IBD_OP_ERRORED;
1922 1922          IBD_ACACHE_INSERT_FREE(state, ce);
1923 1923          mutex_exit(&state->id_ac_mutex);
1924 1924  }
1925 1925  
1926 1926  /*
1927 1927   * While restoring port's presence on the subnet on a port up, it is possible
1928 1928   * that the port goes down again.
1929 1929   */
1930 1930  static void
1931 1931  ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1932 1932  {
1933 1933          ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1934 1934          link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1935 1935              LINK_STATE_UP;
1936 1936          ibd_mce_t *mce, *pmce;
1937 1937          ibd_ace_t *ace, *pace;
1938 1938  
1939 1939          DPRINT(10, "ibd_async_link(): %d", opcode);
1940 1940  
1941 1941          /*
1942 1942           * On a link up, revalidate the link speed/width. No point doing
1943 1943           * this on a link down, since we will be unable to do SA operations,
1944 1944           * defaulting to the lowest speed. Also notice that we update our
1945 1945           * notion of speed before calling mac_link_update(), which will do
1946 1946           * necessary higher level notifications for speed changes.
1947 1947           */
1948 1948          if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1949 1949                  _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1950 1950                  state->id_link_speed = ibd_get_portspeed(state);
1951 1951                  _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1952 1952          }
1953 1953  
1954 1954          /*
1955 1955           * Do all the work required to establish our presence on
1956 1956           * the subnet.
1957 1957           */
1958 1958          if (opcode == IBD_LINK_UP_ABSENT) {
1959 1959                  /*
1960 1960                   * If in promiscuous mode ...
1961 1961                   */
1962 1962                  if (state->id_prom_op == IBD_OP_COMPLETED) {
1963 1963                          /*
1964 1964                           * Drop all nonmembership.
1965 1965                           */
1966 1966                          ibd_async_unsetprom(state);
1967 1967  
1968 1968                          /*
1969 1969                           * Then, try to regain nonmembership to all mcg's.
1970 1970                           */
1971 1971                          ibd_async_setprom(state);
1972 1972  
1973 1973                  }
1974 1974  
1975 1975                  /*
1976 1976                   * Drop all sendonly membership (which also gets rid of the
1977 1977                   * AHs); try to reacquire all full membership.
1978 1978                   */
1979 1979                  mce = list_head(&state->id_mc_full);
1980 1980                  while ((pmce = mce) != NULL) {
1981 1981                          mce = list_next(&state->id_mc_full, mce);
1982 1982                          if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1983 1983                                  ibd_leave_group(state,
1984 1984                                      pmce->mc_info.mc_adds_vect.av_dgid,
1985 1985                                      IB_MC_JSTATE_SEND_ONLY_NON);
1986 1986                          else
1987 1987                                  ibd_reacquire_group(state, pmce);
1988 1988                  }
1989 1989  
1990 1990                  /*
1991 1991                   * Recycle all active AHs to free list (and if there are
1992 1992                   * pending posts, make sure they will go into the free list
1993 1993                   * once the Tx's complete). Grab the lock to prevent
1994 1994                   * concurrent Tx's as well as Tx cleanups.
1995 1995                   */
1996 1996                  mutex_enter(&state->id_ac_mutex);
1997 1997                  ace = list_head(&state->id_ah_active);
1998 1998                  while ((pace = ace) != NULL) {
1999 1999                          boolean_t cycled;
2000 2000  
2001 2001                          ace = list_next(&state->id_ah_active, ace);
2002 2002                          mce = pace->ac_mce;
2003 2003                          if (pace->ac_chan != NULL) {
2004 2004                                  ASSERT(mce == NULL);
2005 2005                                  ASSERT(state->id_enable_rc == B_TRUE);
2006 2006                                  if (pace->ac_chan->chan_state ==
2007 2007                                      IBD_RC_STATE_ACT_ESTAB) {
2008 2008                                          INC_REF(pace, 1);
2009 2009                                          IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
2010 2010                                          pace->ac_chan->chan_state =
2011 2011                                              IBD_RC_STATE_ACT_CLOSING;
2012 2012                                          ibd_rc_signal_act_close(state, pace);
2013 2013                                  } else {
2014 2014                                          state->rc_act_close_simultaneous++;
2015 2015                                          DPRINT(40, "ibd_async_link: other "
2016 2016                                              "thread is closing it, ace=%p, "
2017 2017                                              "ac_chan=%p, chan_state=%d",
2018 2018                                              pace, pace->ac_chan,
2019 2019                                              pace->ac_chan->chan_state);
2020 2020                                  }
2021 2021                          } else {
2022 2022                                  cycled = ibd_acache_recycle(state,
2023 2023                                      &pace->ac_mac, B_TRUE);
2024 2024                          }
2025 2025                          /*
2026 2026                           * If this is for an mcg, it must be for a fullmember,
2027 2027                           * since we got rid of send-only members above when
2028 2028                           * processing the mce list.
2029 2029                           */
2030 2030                          ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2031 2031                              IB_MC_JSTATE_FULL)));
2032 2032  
2033 2033                          /*
2034 2034                           * Check if the fullmember mce needs to be torn down,
2035 2035                           * i.e., whether the DLPI disable has already been
2036 2036                           * If so, do some of the work of tx_cleanup, namely
2037 2037                           * causing leave (which will fail), detach and
2038 2038                           * mce-freeing. tx_cleanup will put the AH into free
2039 2039                           * list. The reason to duplicate some of this
2040 2040                           * tx_cleanup work is because we want to delete the
2041 2041                           * AH right now instead of waiting for tx_cleanup, to
2042 2042                           * force subsequent Tx's to reacquire an AH.
2043 2043                           */
2044 2044                          if ((mce != NULL) && (mce->mc_fullreap))
2045 2045                                  ibd_async_reap_group(state, mce,
2046 2046                                      mce->mc_info.mc_adds_vect.av_dgid,
2047 2047                                      mce->mc_jstate);
2048 2048                  }
2049 2049                  mutex_exit(&state->id_ac_mutex);
2050 2050          }
2051 2051  
2052 2052          /*
2053 2053           * mac handle is guaranteed to exist since driver does ibt_close_hca()
2054 2054           * (which stops further events from being delivered) before
2055 2055           * mac_unregister(). At this point, it is guaranteed that mac_register
2056 2056           * has already been done.
2057 2057           */
2058 2058          mutex_enter(&state->id_link_mutex);
2059 2059          state->id_link_state = lstate;
2060 2060          mac_link_update(state->id_mh, lstate);
2061 2061          mutex_exit(&state->id_link_mutex);
2062 2062  
2063 2063          ibd_async_done(state);
2064 2064  }
2065 2065  
2066 2066  /*
2067 2067   * Check the pkey table to see if we can find the pkey we're looking for.
2068 2068   * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
2069 2069   * failure.
2070 2070   */
2071 2071  static int
2072 2072  ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
2073 2073      uint16_t *pkix)
2074 2074  {
2075 2075          uint16_t ndx;
2076 2076  
2077 2077          ASSERT(pkix != NULL);
2078 2078  
2079 2079          for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2080 2080                  if (pkey_tbl[ndx] == pkey) {
2081 2081                          *pkix = ndx;
2082 2082                          return (0);
2083 2083                  }
2084 2084          }
2085 2085          return (-1);
2086 2086  }
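
/*
 * Illustrative use (not part of this change), mirroring the PORT_CHANGE
 * handling in ibd_link_mod() below: if our pkey moved to a new index in
 * the table, pkix receives the relocated index and a restart follows.
 */
static void
ibd_example_pkey_check(ibd_state_t *state, ibt_hca_portinfo_t *port_infop)
{
        uint16_t pkix;

        if (ibd_locate_pkey(port_infop->p_pkey_tbl,
            port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
                /* pkey present; pkix is its (possibly relocated) index */
        }
}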
2087 2087  
2088 2088  /*
2089 2089   * Late HCA Initialization:
2090 2090   * If plumb had succeeded without the availability of an active port or the
2091 2091   * pkey, and either of their availability is now being indicated via PORT_UP
2092 2092   * or PORT_CHANGE respectively, try a start of the interface.
2093 2093   *
2094 2094   * Normal Operation:
2095 2095   * When the link is notified up, we need to do a few things, based
2096 2096   * on the port's current p_init_type_reply claiming a reinit has been
2097 2097   * done or not. The reinit steps are:
2098 2098   * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2099 2099   *    the old Pkey and GID0 are correct.
2100 2100   * 2. Register for mcg traps (already done by ibmf).
2101 2101   * 3. If PreservePresenceReply indicates the SM has restored port's presence
2102 2102   *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2103 2103   * 4. Give up all sendonly memberships.
2104 2104   * 5. Acquire all full memberships.
2105 2105   * 6. In promiscuous mode, acquire all non memberships.
2106 2106   * 7. Recycle all AHs to free list.
2107 2107   */
2108 2108  static void
2109 2109  ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2110 2110  {
2111 2111          ibt_hca_portinfo_t *port_infop = NULL;
2112 2112          ibt_status_t ibt_status;
2113 2113          uint_t psize, port_infosz;
2114 2114          ibd_link_op_t opcode;
2115 2115          ibd_req_t *req;
2116 2116          link_state_t new_link_state = LINK_STATE_UP;
2117 2117          uint8_t itreply;
2118 2118          uint16_t pkix;
2119 2119          int ret;
2120 2120  
2121 2121          /*
2122 2122           * Let's not race with a plumb or an unplumb; if we detect a
2123 2123           * pkey relocation event later on here, we may have to restart.
2124 2124           */
2125 2125          ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2126 2126  
2127 2127          mutex_enter(&state->id_link_mutex);
2128 2128  
2129 2129          /*
2130 2130           * If the link state is unknown, a plumb has not yet been attempted
2131 2131           * on the interface. Nothing to do.
2132 2132           */
2133 2133          if (state->id_link_state == LINK_STATE_UNKNOWN) {
2134 2134                  mutex_exit(&state->id_link_mutex);
2135 2135                  goto link_mod_return;
2136 2136          }
2137 2137  
2138 2138          /*
2139 2139           * If link state is down because of plumb failure, and we are not in
2140 2140           * late HCA init, and we were not successfully plumbed, nothing to do.
2141 2141           */
2142 2142          if ((state->id_link_state == LINK_STATE_DOWN) &&
2143 2143              ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
2144 2144              ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
2145 2145                  mutex_exit(&state->id_link_mutex);
2146 2146                  goto link_mod_return;
2147 2147          }
2148 2148  
2149 2149          /*
2150 2150           * If this routine was called in response to a port down event,
2151 2151           * we just need to see if this should be informed.
2152 2152           */
2153 2153          if (code == IBT_ERROR_PORT_DOWN) {
2154 2154                  new_link_state = LINK_STATE_DOWN;
2155 2155                  goto update_link_state;
2156 2156          }
2157 2157  
2158 2158          /*
2159 2159           * If it's not a port down event we've received, try to get the port
2160 2160           * attributes first. If we fail here, the port is as good as down.
2161 2161           * Otherwise, if the link went down by the time the handler gets
2162 2162           * here, give up - we cannot even validate the pkey/gid since those
2163 2163           * are not valid and this is as bad as a port down anyway.
2164 2164           */
2165 2165          ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2166 2166              &port_infop, &psize, &port_infosz);
2167 2167          if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2168 2168              (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2169 2169                  new_link_state = LINK_STATE_DOWN;
2170 2170                  goto update_link_state;
2171 2171          }
2172 2172  
2173 2173          /*
2174 2174           * If in the previous attempt, the pkey was not found either due to the
2175 2175           * port state being down, or due to its absence in the pkey table,
2176 2176           * look for it now and try to start the interface.
2177 2177           */
2178 2178          if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
2179 2179                  mutex_exit(&state->id_link_mutex);
2180 2180                  if ((ret = ibd_start(state)) != 0) {
2181 2181                          DPRINT(10, "ibd_link_mod: cannot start from late HCA "
2182 2182                              "init, ret=%d", ret);
2183 2183                  }
2184 2184                  ibt_free_portinfo(port_infop, port_infosz);
2185 2185                  goto link_mod_return;
2186 2186          }
2187 2187  
2188 2188          /*
2189 2189           * Check the SM InitTypeReply flags. If both NoLoadReply and
2190 2190           * PreserveContentReply are 0, we don't know anything about the
2191 2191           * data loaded into the port attributes, so we need to verify
2192 2192           * if gid0 and pkey are still valid.
2193 2193           */
2194 2194          itreply = port_infop->p_init_type_reply;
2195 2195          if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2196 2196              ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2197 2197                  /*
2198 2198                   * Check to see if the subnet part of GID0 has changed. If
2199 2199                   * not, check the simple case first to see if the pkey
2200 2200                   * index is the same as before; finally check to see if the
2201 2201                   * pkey has been relocated to a different index in the table.
2202 2202                   */
2203 2203                  _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2204 2204                  if (bcmp(port_infop->p_sgid_tbl,
2205 2205                      &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2206 2206  
2207 2207                          new_link_state = LINK_STATE_DOWN;
2208 2208  
2209 2209                  } else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2210 2210                      state->id_pkey) {
2211 2211  
2212 2212                          new_link_state = LINK_STATE_UP;
2213 2213  
2214 2214                  } else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2215 2215                      port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2216 2216  
2217 2217                          ibt_free_portinfo(port_infop, port_infosz);
2218 2218                          mutex_exit(&state->id_link_mutex);
2219 2219  
2220 2220                          /*
2221 2221                           * Currently a restart is required if our pkey has moved
2222 2222                           * in the pkey table. If we get the ibt_recycle_ud() to
2223 2223                           * work as documented (expected), we may be able to
2224 2224                           * avoid a complete restart.  Note that we've already
2225 2225                           * marked both the start and stop 'in-progress' flags,
2226 2226                           * so it is ok to go ahead and do this restart.
2227 2227                           */
2228 2228                          (void) ibd_undo_start(state, LINK_STATE_DOWN);
2229 2229                          if ((ret = ibd_start(state)) != 0) {
2230 2230                                  DPRINT(10, "ibd_restart: cannot restart, "
2231 2231                                      "ret=%d", ret);
2232 2232                          }
2233 2233  
2234 2234                          goto link_mod_return;
2235 2235                  } else {
2236 2236                          new_link_state = LINK_STATE_DOWN;
2237 2237                  }
2238 2238                  _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2239 2239          }
2240 2240  
2241 2241  update_link_state:
2242 2242          if (port_infop) {
2243 2243                  ibt_free_portinfo(port_infop, port_infosz);
2244 2244          }
2245 2245  
2246 2246          /*
2247 2247           * If we're reporting a link up, check InitTypeReply to see if
2248 2248           * the SM has ensured that the port's presence in mcg, traps,
2249 2249           * etc. is intact.
2250 2250           */
2251 2251          if (new_link_state == LINK_STATE_DOWN) {
2252 2252                  opcode = IBD_LINK_DOWN;
2253 2253          } else {
2254 2254                  if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2255 2255                      SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2256 2256                          opcode = IBD_LINK_UP;
2257 2257                  } else {
2258 2258                          opcode = IBD_LINK_UP_ABSENT;
2259 2259                  }
2260 2260          }
2261 2261  
2262 2262          /*
2263 2263           * If the old state is the same as the new state, and the SM indicated
2264 2264           * no change in the port parameters, nothing to do.
2265 2265           */
2266 2266          if ((state->id_link_state == new_link_state) && (opcode !=
2267 2267              IBD_LINK_UP_ABSENT)) {
2268 2268                  mutex_exit(&state->id_link_mutex);
2269 2269                  goto link_mod_return;
2270 2270          }
2271 2271  
2272 2272          /*
2273 2273           * Ok, so there was a link state change; see if it's safe to ask
2274 2274           * the async thread to do the work
2275 2275           */
2276 2276          if (!ibd_async_safe(state)) {
2277 2277                  state->id_link_state = new_link_state;
2278 2278                  mutex_exit(&state->id_link_mutex);
2279 2279                  goto link_mod_return;
2280 2280          }
2281 2281  
2282 2282          mutex_exit(&state->id_link_mutex);
2283 2283  
2284 2284          /*
2285 2285           * Queue up a request for ibd_async_link() to handle this link
2286 2286           * state change event
2287 2287           */
2288 2288          req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2289 2289          req->rq_ptr = (void *)opcode;
2290 2290          ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2291 2291  
2292 2292  link_mod_return:
2293 2293          ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2294 2294  }
2295 2295  
2296 2296  /*
2297 2297   * For the port up/down events, IBTL guarantees there will not be concurrent
2298 2298   * invocations of the handler. IBTL might coalesce link transition events,
2299 2299   * and not invoke the handler for _each_ up/down transition, but it will
2300 2300   * invoke the handler with the last known state.
2301 2301   */
2302 2302  static void
2303 2303  ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2304 2304      ibt_async_code_t code, ibt_async_event_t *event)
2305 2305  {
2306 2306          ibd_state_t *state = (ibd_state_t *)clnt_private;
2307 2307  
2308 2308          switch (code) {
2309 2309          case IBT_ERROR_CATASTROPHIC_CHAN:
2310 2310                  ibd_print_warn(state, "catastrophic channel error");
2311 2311                  break;
2312 2312          case IBT_ERROR_CQ:
2313 2313                  ibd_print_warn(state, "completion queue error");
2314 2314                  break;
2315 2315          case IBT_PORT_CHANGE_EVENT:
2316 2316                  /*
2317 2317                   * Events will be delivered to all instances that have
2318 2318                   * done ibt_open_hca() but not yet done ibt_close_hca().
2319 2319                   * Only need to do work for our port; IBTF will deliver
2320 2320                   * events for other ports on the hca we have ibt_open_hca'ed
2321 2321                   * too. Note that id_port is initialized in ibd_attach()
2322 2322                   * before we do an ibt_open_hca() in ibd_attach().
2323 2323                   */
2324 2324                  ASSERT(state->id_hca_hdl == hca_hdl);
2325 2325                  if (state->id_port != event->ev_port)
2326 2326                          break;
2327 2327  
2328 2328                  if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2329 2329                      IBT_PORT_CHANGE_PKEY) {
2330 2330                          ibd_link_mod(state, code);
2331 2331                  }
2332 2332                  break;
2333 2333          case IBT_ERROR_PORT_DOWN:
2334 2334          case IBT_CLNT_REREG_EVENT:
2335 2335          case IBT_EVENT_PORT_UP:
2336 2336                  /*
2337 2337                   * Events will be delivered to all instances that have
2338 2338                   * done ibt_open_hca() but not yet done ibt_close_hca().
2339 2339                   * Only need to do work for our port; IBTF will deliver
2340 2340                   * events for other ports on the hca we have ibt_open_hca'ed
2341 2341                   * too. Note that id_port is initialized in ibd_attach()
2342 2342                   * before we do an ibt_open_hca() in ibd_attach().
2343 2343                   */
2344 2344                  ASSERT(state->id_hca_hdl == hca_hdl);
2345 2345                  if (state->id_port != event->ev_port)
2346 2346                          break;
2347 2347  
2348 2348                  ibd_link_mod(state, code);
2349 2349                  break;
2350 2350  
2351 2351          case IBT_HCA_ATTACH_EVENT:
2352 2352          case IBT_HCA_DETACH_EVENT:
2353 2353                  /*
2354 2354                   * When a new card is plugged to the system, attach_event is
2355 2355                   * invoked. Additionally, a cfgadm needs to be run to make the
2356 2356                   * card known to the system, and an ifconfig needs to be run to
2357 2357                   * plumb up any ibd interfaces on the card. In the case of card
2358 2358                   * unplug, a cfgadm is run that will trigger any RCM scripts to
2359 2359                   * unplumb the ibd interfaces on the card; when the card is
2360 2360                   * actually unplugged, the detach_event is invoked;
2361 2361                   * additionally, if any ibd instances are still active on the
2362 2362                   * card (eg there were no associated RCM scripts), driver's
2363 2363                   * detach routine is invoked.
2364 2364                   */
2365 2365                  break;
2366 2366          default:
2367 2367                  break;
2368 2368          }
2369 2369  }
2370 2370  
2371 2371  static int
2372 2372  ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2373 2373  {
2374 2374          mac_register_t *macp;
2375 2375          int ret;
2376 2376  
2377 2377          if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2378 2378                  DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2379 2379                  return (DDI_FAILURE);
2380 2380          }
2381 2381  
2382 2382          /*
2383 2383           * Note that when we register with mac during attach, we don't
2384 2384           * have the id_macaddr yet, so we'll simply be registering a
2385 2385           * zero macaddr that we'll overwrite later during plumb (in
2386 2386           * ibd_m_start()). Similar is the case with id_mtu - we'll
2387 2387           * update the mac layer with the correct mtu during plumb.
2388 2388           */
2389 2389          macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2390 2390          macp->m_driver = state;
2391 2391          macp->m_dip = dip;
2392 2392          macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2393 2393          macp->m_callbacks = &ibd_m_callbacks;
2394 2394          macp->m_min_sdu = 0;
2395 2395          macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
2396 2396          if (state->id_type == IBD_PORT_DRIVER) {
2397 2397                  macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2398 2398          } else if (state->id_enable_rc) {
2399 2399                  macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2400 2400          } else {
2401 2401                  macp->m_max_sdu = IBD_DEF_MAX_SDU;
2402 2402          }
2403 2403          macp->m_priv_props = ibd_priv_props;
2404 2404  
2405 2405          /*
2406 2406           * Register ourselves with the GLDv3 interface.
2407 2407           */
2408 2408          if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2409 2409                  mac_free(macp);
2410 2410                  DPRINT(10,
2411 2411                      "ibd_register_mac: mac_register() failed, ret=%d", ret);
2412 2412                  return (DDI_FAILURE);
2413 2413          }
2414 2414  
2415 2415          mac_free(macp);
2416 2416          return (DDI_SUCCESS);
2417 2417  }
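           /*
            * Reviewer sketch of the SDU arithmetic above, with hypothetical
            * values: IPOIB_HDRSIZE is the 4-byte IPoIB encapsulation header,
            * and assuming rc_mtu defaults to 65524, the RC link mode would
            * advertise
            *
            *         m_max_sdu = rc_mtu - IPOIB_HDRSIZE = 65524 - 4 = 65520
            *
            * i.e. the connected-mode MTU less the IPoIB header, while plain
            * UD mode advertises IBD_DEF_MAX_SDU.
            */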
2418 2418  
2419 2419  static int
2420 2420  ibd_record_capab(ibd_state_t *state)
2421 2421  {
2422 2422          ibt_hca_attr_t hca_attrs;
2423 2423          ibt_status_t ibt_status;
2424 2424  
2425 2425          _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2426 2426  
2427 2427          /*
2428 2428           * Query the HCA and fetch its attributes
2429 2429           */
2430 2430          ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2431 2431          ASSERT(ibt_status == IBT_SUCCESS);
2432 2432  
2433 2433          /*
2434 2434           * 1. Set the Hardware Checksum capability. Currently we only consider
2435 2435           *    full checksum offload.
2436 2436           */
2437 2437          if (state->id_enable_rc) {
2438 2438                  state->id_hwcksum_capab = 0;
2439 2439          } else {
2440 2440                  if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2441 2441                      == IBT_HCA_CKSUM_FULL) {
2442 2442                          state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2443 2443                  }
2444 2444          }
2445 2445  
2446 2446          /*
2447 2447           * 2. Set LSO policy, capability and maximum length
2448 2448           */
2449 2449          if (state->id_enable_rc) {
2450 2450                  state->id_lso_capable = B_FALSE;
2451 2451                  state->id_lso_maxlen = 0;
2452 2452          } else {
2453 2453                  if (hca_attrs.hca_max_lso_size > 0) {
2454 2454                          state->id_lso_capable = B_TRUE;
2455 2455                          if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2456 2456                                  state->id_lso_maxlen = IBD_LSO_MAXLEN;
2457 2457                          else
2458 2458                                  state->id_lso_maxlen =
2459 2459                                      hca_attrs.hca_max_lso_size;
2460 2460                  } else {
2461 2461                          state->id_lso_capable = B_FALSE;
2462 2462                          state->id_lso_maxlen = 0;
2463 2463                  }
2464 2464          }
2465 2465  
2466 2466          /*
2467 2467           * 3. Set Reserved L_Key capability
2468 2468           */
2469 2469          if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2470 2470                  state->id_hca_res_lkey_capab = 1;
2471 2471                  state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2472 2472                  state->rc_enable_iov_map = B_TRUE;
2473 2473          } else {
2474 2474                  /* If no reserved lkey, we will not use ibt_map_mem_iov */
2475 2475                  state->rc_enable_iov_map = B_FALSE;
2476 2476          }
2477 2477  
2478 2478          /*
2479 2479           * 4. Set maximum sqseg value after checking to see if extended sgl
2480 2480           *    size information is provided by the hca
2481 2481           */
2482 2482          if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2483 2483                  state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2484 2484                  state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2485 2485          } else {
2486 2486                  state->id_max_sqseg = hca_attrs.hca_max_sgl;
2487 2487                  state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2488 2488          }
2489 2489          if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2490 2490                  state->id_max_sqseg = IBD_MAX_SQSEG;
2491 2491          } else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2492 2492                  ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2493 2493                      state->id_max_sqseg, IBD_MAX_SQSEG);
2494 2494          }
2495 2495          if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2496 2496                  state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2497 2497          } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2498 2498                  ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2499 2499                      "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2500 2500          }
2501 2501  
2502 2502          /*
2503 2503           * Translating the virtual address regions into physical regions
2504 2504           * for using the Reserved LKey feature results in a wr sgl that
2505 2505           * is a little longer. Since failing ibt_map_mem_iov() is costly,
2506 2506           * we'll fix a high-water mark (65%) for when we should stop.
2507 2507           */
2508 2508          state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2509 2509          state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
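                   /*
                    * Worked example (reviewer note): if IBD_MAX_SQSEG were 59,
                    * integer arithmetic gives (59 * 65) / 100 = 38, so a send
                    * needing more than 38 sgl entries after virtual-to-physical
                    * translation would be copied rather than handed to
                    * ibt_map_mem_iov().
                    */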
2510 2510  
2511 2511          /*
2512 2512           * 5. Set number of recv and send wqes after checking hca maximum
2513 2513           *    channel size. Store the max channel size in the state so that it
2514 2514           *    can be referred to when the swqe/rwqe change is requested via
2515 2515           *    dladm.
2516 2516           */
2517 2517  
2518 2518          state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2519 2519  
2520 2520          if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2521 2521                  state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2522 2522  
2523 2523          state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2524 2524              IBD_RWQE_MIN;
2525 2525  
2526 2526          if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2527 2527                  state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2528 2528  
2529 2529          _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2530 2530  
2531 2531          return (DDI_SUCCESS);
2532 2532  }
2533 2533  
2534 2534  static int
2535 2535  ibd_part_busy(ibd_state_t *state)
2536 2536  {
2537 2537          if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2538 2538                  DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
2539 2539                  return (DDI_FAILURE);
2540 2540          }
2541 2541  
2542 2542          if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2543 2543                  DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
2544 2544                  return (DDI_FAILURE);
2545 2545          }
2546 2546  
2547 2547          /*
2548 2548           * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
2549 2549           * connecting to a remote IPoIB port. We can't remove this port.
2550 2550           */
2551 2551          if (state->id_ah_op == IBD_OP_ONGOING) {
2552 2552                  DPRINT(10, "ibd_part_busy: failed: connecting\n");
2553 2553                  return (DDI_FAILURE);
2554 2554          }
2555 2555  
2556 2556          return (DDI_SUCCESS);
2557 2557  }
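           /*
            * Hypothetical caller sketch (reviewer illustration; the actual
            * ioctl path is outside this hunk): a partition-delete request
            * would be refused while the port is busy, e.g.
            *
            *         if (ibd_part_busy(state) != DDI_SUCCESS)
            *                 return (EBUSY);
            */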
2558 2558  
2559 2559  
2560 2560  static void
2561 2561  ibd_part_unattach(ibd_state_t *state)
2562 2562  {
2563 2563          uint32_t progress = state->id_mac_state;
2564 2564          ibt_status_t ret;
2565 2565  
2566 2566          /* make sure rx resources are freed */
2567 2567          ibd_free_rx_rsrcs(state);
2568 2568  
2569 2569          if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2570 2570                  ASSERT(state->id_enable_rc);
2571 2571                  ibd_rc_fini_srq_list(state);
2572 2572                  state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2573 2573          }
2574 2574  
2575 2575          if (progress & IBD_DRV_MAC_REGISTERED) {
2576 2576                  (void) mac_unregister(state->id_mh);
2577 2577                  state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2578 2578          }
2579 2579  
2580 2580          if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2581 2581                  /*
2582 2582                   * No new async requests will be posted since the device
2583 2583                   * link state has been marked as unknown; completion handlers
2584 2584                   * have been turned off, so the Tx handler will not cause any
2585 2585                   * more IBD_ASYNC_REAP requests.
2586 2586                   *
2587 2587                   * Queue a request for the async thread to exit, which will
2588 2588                   * be serviced after any pending ones. This can take a while,
2589 2589                   * especially if the SM is unreachable, since IBMF will slowly
2590 2590                   * timeout each SM request issued by the async thread.  Reap
2591 2591                   * the thread before continuing on, we do not want it to be
2592 2592                   * lingering in modunloaded code.
2593 2593                   */
2594 2594                  ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2595 2595                  thread_join(state->id_async_thrid);
2596 2596  
2597 2597                  state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2598 2598          }
2599 2599  
2600 2600          if (progress & IBD_DRV_REQ_LIST_INITED) {
2601 2601                  list_destroy(&state->id_req_list);
2602 2602                  mutex_destroy(&state->id_acache_req_lock);
2603 2603                  cv_destroy(&state->id_acache_req_cv);
2604 2604                  state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2605 2605          }
2606 2606  
2607 2607          if (progress & IBD_DRV_PD_ALLOCD) {
2608 2608                  if ((ret = ibt_free_pd(state->id_hca_hdl,
2609 2609                      state->id_pd_hdl)) != IBT_SUCCESS) {
2610 2610                          ibd_print_warn(state, "failed to free "
2611 2611                              "protection domain, ret=%d", ret);
2612 2612                  }
2613 2613                  state->id_pd_hdl = NULL;
2614 2614                  state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2615 2615          }
2616 2616  
2617 2617          if (progress & IBD_DRV_HCA_OPENED) {
2618 2618                  if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2619 2619                      IBT_SUCCESS) {
2620 2620                          ibd_print_warn(state, "failed to close "
2621 2621                              "HCA device, ret=%d", ret);
2622 2622                  }
2623 2623                  state->id_hca_hdl = NULL;
2624 2624                  state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2625 2625          }
2626 2626  
2627 2627          mutex_enter(&ibd_gstate.ig_mutex);
2628 2628          if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2629 2629                  if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2630 2630                      IBT_SUCCESS) {
2631 2631                          ibd_print_warn(state,
2632 2632                              "ibt_detach() failed, ret=%d", ret);
2633 2633                  }
2634 2634                  state->id_ibt_hdl = NULL;
2635 2635                  state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2636 2636                  ibd_gstate.ig_ibt_hdl_ref_cnt--;
2637 2637          }
2638 2638          if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2639 2639              (ibd_gstate.ig_ibt_hdl != NULL)) {
2640 2640                  if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2641 2641                      IBT_SUCCESS) {
2642 2642                          ibd_print_warn(state, "ibt_detach(): global "
2643 2643                              "failed, ret=%d", ret);
2644 2644                  }
2645 2645                  ibd_gstate.ig_ibt_hdl = NULL;
2646 2646          }
2647 2647          mutex_exit(&ibd_gstate.ig_mutex);
2648 2648  
2649 2649          if (progress & IBD_DRV_TXINTR_ADDED) {
2650 2650                  ddi_remove_softintr(state->id_tx);
2651 2651                  state->id_tx = NULL;
2652 2652                  state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2653 2653          }
2654 2654  
2655 2655          if (progress & IBD_DRV_RXINTR_ADDED) {
2656 2656                  ddi_remove_softintr(state->id_rx);
2657 2657                  state->id_rx = NULL;
2658 2658                  state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2659 2659          }
2660 2660  
2661 2661  #ifdef DEBUG
2662 2662          if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2663 2663                  kstat_delete(state->rc_ksp);
2664 2664                  state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2665 2665          }
2666 2666  #endif
2667 2667  
2668 2668          if (progress & IBD_DRV_STATE_INITIALIZED) {
2669 2669                  ibd_state_fini(state);
2670 2670                  state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2671 2671          }
2672 2672  }
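           /*
            * Reviewer note on the idiom above: each attach step sets a bit in
            * id_mac_state once its resource is acquired, and unattach releases
            * only what the recorded bits cover, clearing each bit as it goes,
            * so a partially failed attach can be unwound by this same routine.
            * Minimal sketch, using a hypothetical flag and undo routine:
            *
            *         if (progress & IBD_DRV_FOO_DONE) {
            *                 foo_undo(state);
            *                 state->id_mac_state &= (~IBD_DRV_FOO_DONE);
            *         }
            */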
2673 2673  
2674 2674  int
2675 2675  ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2676 2676  {
2677 2677          ibt_status_t ret;
2678 2678          int rv;
2679 2679          kthread_t *kht;
2680 2680  
2681 2681          /*
2682 2682           * Initialize mutexes and condition variables
2683 2683           */
2684 2684          if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2685 2685                  DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2686 2686                  return (DDI_FAILURE);
2687 2687          }
2688 2688          state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2689 2689  
2690 2690          /*
2691 2691           * Allocate the rx and tx softintrs
2692 2692           */
2693 2693          if (ibd_rx_softintr == 1) {
2694 2694                  if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2695 2695                      NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2696 2696                          DPRINT(10, "ibd_part_attach: failed in "
2697 2697                              "ddi_add_softintr(id_rx), ret=%d", rv);
2698 2698                          return (DDI_FAILURE);
2699 2699                  }
2700 2700                  state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2701 2701          }
2702 2702          if (ibd_tx_softintr == 1) {
2703 2703                  if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2704 2704                      NULL, NULL, ibd_tx_recycle,
2705 2705                      (caddr_t)state)) != DDI_SUCCESS) {
2706 2706                          DPRINT(10, "ibd_part_attach: failed in "
2707 2707                              "ddi_add_softintr(id_tx), ret=%d", rv);
2708 2708                          return (DDI_FAILURE);
2709 2709                  }
2710 2710                  state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2711 2711          }
2712 2712  
2713 2713          /*
2714 2714           * Attach to IBTL
2715 2715           */
2716 2716          mutex_enter(&ibd_gstate.ig_mutex);
2717 2717          if (ibd_gstate.ig_ibt_hdl == NULL) {
2718 2718                  if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2719 2719                      &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2720 2720                          DPRINT(10, "ibd_part_attach: global: failed in "
2721 2721                              "ibt_attach(), ret=%d", ret);
2722 2722                          mutex_exit(&ibd_gstate.ig_mutex);
2723 2723                          return (DDI_FAILURE);
2724 2724                  }
2725 2725          }
2726 2726          if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2727 2727              &state->id_ibt_hdl)) != IBT_SUCCESS) {
2728 2728                  DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2729 2729                      ret);
2730 2730                  mutex_exit(&ibd_gstate.ig_mutex);
2731 2731                  return (DDI_FAILURE);
2732 2732          }
2733 2733          ibd_gstate.ig_ibt_hdl_ref_cnt++;
2734 2734          mutex_exit(&ibd_gstate.ig_mutex);
2735 2735          state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2736 2736  
2737 2737          /*
2738 2738           * Open the HCA
2739 2739           */
2740 2740          if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2741 2741              &state->id_hca_hdl)) != IBT_SUCCESS) {
2742 2742                  DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2743 2743                      ret);
2744 2744                  return (DDI_FAILURE);
2745 2745          }
2746 2746          state->id_mac_state |= IBD_DRV_HCA_OPENED;
2747 2747  
2748 2748  #ifdef DEBUG
2749 2749          /* Initialize Driver Counters for Reliable Connected Mode */
2750 2750          if (state->id_enable_rc) {
2751 2751                  if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2752 2752                          DPRINT(10, "ibd_part_attach: failed in "
2753 2753                              "ibd_rc_init_stats");
2754 2754                          return (DDI_FAILURE);
2755 2755                  }
2756 2756                  state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2757 2757          }
2758 2758  #endif
2759 2759  
2760 2760          /*
2761 2761           * Record capabilities
2762 2762           */
2763 2763          (void) ibd_record_capab(state);
2764 2764  
2765 2765          /*
2766 2766           * Allocate a protection domain on the HCA
2767 2767           */
2768 2768          if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2769 2769              &state->id_pd_hdl)) != IBT_SUCCESS) {
2770 2770                  DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2771 2771                      ret);
2772 2772                  return (DDI_FAILURE);
2773 2773          }
2774 2774          state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2775 2775  
2776 2776  
2777 2777          /*
2778 2778           * We need to initialize the req_list that is required for the
2779 2779           * operation of the async thread.
2780 2780           */
2781 2781          mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2782 2782          cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2783 2783          list_create(&state->id_req_list, sizeof (ibd_req_t),
2784 2784              offsetof(ibd_req_t, rq_list));
2785 2785          state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2786 2786  
2787 2787          /*
2788 2788           * Create the async thread; thread_create never fails.
2789 2789           */
2790 2790          kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2791 2791              TS_RUN, minclsyspri);
2792 2792          state->id_async_thrid = kht->t_did;
2793 2793          state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2794 2794  
2795 2795          return (DDI_SUCCESS);
2796 2796  }
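           /*
            * Reviewer sketch of the expected error handling (hedged; the
            * caller sits outside this hunk): since ibd_part_attach() leaves
            * its progress recorded in id_mac_state on DDI_FAILURE, a caller
            * would unwind with ibd_part_unattach(), e.g.
            *
            *         if (ibd_part_attach(state, dip) != DDI_SUCCESS) {
            *                 ibd_part_unattach(state);
            *                 return (DDI_FAILURE);
            *         }
            */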
2797 2797  
2798 2798  /*
2799 2799   * Attach device to the IO framework.
2800 2800   */
2801 2801  static int
2802 2802  ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2803 2803  {
2804 2804          int ret;
2805 2805  
2806 2806          switch (cmd) {
2807 2807                  case DDI_ATTACH:
2808 2808                          ret = ibd_port_attach(dip);
2809 2809                          break;
2810 2810                  default:
2811 2811                          ret = DDI_FAILURE;
2812 2812                          break;
2813 2813          }
2814 2814          return (ret);
2815 2815  }
2816 2816  
2817 2817  /*
2818 2818   * Detach device from the IO framework.
2819 2819   */
2820 2820  static int
2821 2821  ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2822 2822  {
2823 2823          ibd_state_t *state;
2824 2824          int instance;
2825 2825  
2826 2826          /*
2827 2827           * IBD doesn't support suspend/resume
2828 2828           */
2829 2829          if (cmd != DDI_DETACH)
2830 2830                  return (DDI_FAILURE);
2831 2831  
2832 2832          /*
2833 2833           * Get the instance softstate
2834 2834           */
2835 2835          instance = ddi_get_instance(dip);
2836 2836          state = ddi_get_soft_state(ibd_list, instance);
2837 2837  
2838 2838          /*
2839 2839           * Release all resources we're still holding.  Note that if we'd
2840 2840           * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2841 2841           * so far, we should find all the flags we need in id_mac_state.
2842 2842           */
2843 2843          return (ibd_port_unattach(state, dip));
2844 2844  }
2845 2845  
2846 2846  /*
2847 2847   * Pre ibt_attach() driver initialization
2848 2848   */
2849 2849  static int
2850 2850  ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2851 2851  {
2852 2852          char buf[64];
2853 2853  
2854 2854          mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2855 2855          state->id_link_state = LINK_STATE_UNKNOWN;
2856 2856  
2857 2857          mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2858 2858          cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2859 2859          state->id_trap_stop = B_TRUE;
2860 2860          state->id_trap_inprog = 0;
2861 2861  
2862 2862          mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2863 2863          mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2864 2864          state->id_dip = dip;
2865 2865  
2866 2866          mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2867 2867  
2868 2868          mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2869 2869          mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2870 2870          mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2871 2871          state->id_tx_busy = 0;
2872 2872          mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2873 2873  
2874 2874          state->id_rx_list.dl_bufs_outstanding = 0;
2875 2875          state->id_rx_list.dl_cnt = 0;
2876 2876          mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2877 2877          mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2878 2878          (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip),
2879 2879              state->id_pkey, state->id_plinkid);
2880 2880          state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2881 2881              0, NULL, NULL, NULL, NULL, NULL, 0);
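                   /*
                    * Example of the generated cache name (reviewer note):
                    * instance 0, pkey 0xffff and plinkid 5 yield
                    * "ibd_req0_ffff_5"; the worst case stays well under the
                    * 64 bytes reserved in buf.
                    */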
2882 2882  
2883 2883          /* For Reliable Connected Mode */
2884 2884          mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2885 2885          mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2886 2886          mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2887 2887          mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2888 2888          mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2889 2889              MUTEX_DRIVER, NULL);
2890 2890          mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);
2891 2891  
2892 2892          /*
2893 2893           * Make RC the default link mode. If this fails during connection
2894 2894           * setup, the link mode is automatically transitioned to UD.
2895 2895           * Also set the RC MTU.
2896 2896           */
2897 2897          state->id_enable_rc = IBD_DEF_LINK_MODE;
2898 2898          state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2899 2899          state->id_mtu = IBD_DEF_MAX_MTU;
2900 2900  
2901 2901          /* Initialize all tunables to their defaults */
2902 2902          state->id_lso_policy = IBD_DEF_LSO_POLICY;
2903 2903          state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2904 2904          state->id_num_ah = IBD_DEF_NUM_AH;
2905 2905          state->id_hash_size = IBD_DEF_HASH_SIZE;
2906 2906          state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2907 2907          state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2908 2908          state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2909 2909          state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2910 2910          state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2911 2911          state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2912 2912          state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2913 2913          state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2914 2914          state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2915 2915          state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2916 2916          state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2917 2917          state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2918 2918          state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2919 2919          state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2920 2920          state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2921 2921          state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2922 2922          state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2923 2923          state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2924 2924          state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2925 2925          state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2926 2926  
2927 2927          return (DDI_SUCCESS);
2928 2928  }
2929 2929  
2930 2930  /*
2931 2931   * Post ibt_detach() driver deconstruction
2932 2932   */
2933 2933  static void
2934 2934  ibd_state_fini(ibd_state_t *state)
2935 2935  {
2936 2936          kmem_cache_destroy(state->id_req_kmc);
2937 2937  
2938 2938          mutex_destroy(&state->id_rx_list.dl_mutex);
2939 2939          mutex_destroy(&state->id_rx_free_list.dl_mutex);
2940 2940  
2941 2941          mutex_destroy(&state->id_txpost_lock);
2942 2942          mutex_destroy(&state->id_tx_list.dl_mutex);
2943 2943          mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2944 2944          mutex_destroy(&state->id_lso_lock);
2945 2945  
2946 2946          mutex_destroy(&state->id_sched_lock);
2947 2947          mutex_destroy(&state->id_scq_poll_lock);
2948 2948          mutex_destroy(&state->id_rcq_poll_lock);
2949 2949  
2950 2950          cv_destroy(&state->id_trap_cv);
2951 2951          mutex_destroy(&state->id_trap_lock);
2952 2952          mutex_destroy(&state->id_link_mutex);
2953 2953  
2954 2954          /* For Reliable Connected Mode */
2955 2955          mutex_destroy(&state->rc_timeout_lock);
2956 2956          mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2957 2957          mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2958 2958          mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2959 2959          mutex_destroy(&state->rc_tx_large_bufs_lock);
2960 2960          mutex_destroy(&state->rc_rx_lock);
2961 2961  }
2962 2962  
2963 2963  /*
2964 2964   * Fetch link speed from SA for snmp ifspeed reporting.
2965 2965   */
2966 2966  static uint64_t
2967 2967  ibd_get_portspeed(ibd_state_t *state)
2968 2968  {
2969 2969          int                     ret;
2970 2970          ibt_path_info_t         path;
2971 2971          ibt_path_attr_t         path_attr;
2972 2972          uint8_t                 num_paths;
2973 2973          uint64_t                ifspeed;
2974 2974  
2975 2975          /*
2976 2976           * Due to serdes 8b10b encoding, 2.5 Gbps on the wire
2977 2977           * translates to a 2 Gbps data rate. Thus, the 1X single data rate is
2978 2978           * 2000000000. Start with that as the default.
2979 2979           */
2980 2980          ifspeed = 2000000000;
2981 2981  
2982 2982          bzero(&path_attr, sizeof (path_attr));
2983 2983  
2984 2984          /*
2985 2985           * Get the port speed from Loopback path information.
2986 2986           */
2987 2987          path_attr.pa_dgids = &state->id_sgid;
2988 2988          path_attr.pa_num_dgids = 1;
2989 2989          path_attr.pa_sgid = state->id_sgid;
2990 2990  
2991 2991          if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2992 2992              &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2993 2993                  goto earlydone;
2994 2994  
2995 2995          if (num_paths < 1)
2996 2996                  goto earlydone;
2997 2997  
2998 2998          /*
2999 2999           * In case SA does not return an expected value, report the default
3000 3000           * speed as 1X.
3001 3001           */
3002 3002          ret = 1;
3003 3003          switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
3004 3004                  case IBT_SRATE_2:       /*  1X SDR i.e. 2.5 Gbps */
3005 3005                          ret = 1;
3006 3006                          break;
3007 3007                  case IBT_SRATE_10:      /*  4X SDR or 1X QDR i.e. 10 Gbps */
3008 3008                          ret = 4;
3009 3009                          break;
3010 3010                  case IBT_SRATE_30:      /* 12X SDR i.e. 30 Gbps */
3011 3011                          ret = 12;
3012 3012                          break;
3013 3013                  case IBT_SRATE_5:       /*  1X DDR i.e.  5 Gbps */
3014 3014                          ret = 2;
3015 3015                          break;
3016 3016                  case IBT_SRATE_20:      /*  4X DDR or 8X SDR i.e. 20 Gbps */
3017 3017                          ret = 8;
3018 3018                          break;
3019 3019                  case IBT_SRATE_40:      /*  8X DDR or 4X QDR i.e. 40 Gbps */
3020 3020                          ret = 16;
3021 3021                          break;
3022 3022                  case IBT_SRATE_60:      /* 12X DDR i.e. 60 Gbps */
3023 3023                          ret = 24;
3024 3024                          break;
3025 3025                  case IBT_SRATE_80:      /*  8X QDR i.e. 80 Gbps */
3026 3026                          ret = 32;
3027 3027                          break;
3028 3028                  case IBT_SRATE_120:     /* 12X QDR i.e. 120 Gbps */
3029 3029                          ret = 48;
3030 3030                          break;
3031 3031          }
3032 3032  
3033 3033          ifspeed *= ret;
3034 3034  
3035 3035  earlydone:
3036 3036          return (ifspeed);
3037 3037  }
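           /*
            * Worked example (reviewer note): a 4X DDR link reports
            * IBT_SRATE_20, so ret = 8 and ifspeed = 8 * 2000000000 =
            * 16000000000 bits/sec, i.e. 16 Gbps of data on a 20 Gbps
            * wire after 8b10b decoding.
            */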
3038 3038  
3039 3039  /*
3040 3040   * Search input mcg list (id_mc_full or id_mc_non) for an entry
3041 3041   * representing the input mcg mgid.
3042 3042   */
3043 3043  static ibd_mce_t *
3044 3044  ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
3045 3045  {
3046 3046          ibd_mce_t *ptr = list_head(mlist);
3047 3047  
3048 3048          /*
3049 3049           * Do plain linear search.
3050 3050           */
3051 3051          while (ptr != NULL) {
3052 3052                  if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
3053 3053                      sizeof (ib_gid_t)) == 0)
3054 3054                          return (ptr);
3055 3055                  ptr = list_next(mlist, ptr);
3056 3056          }
3057 3057          return (NULL);
3058 3058  }
3059 3059  
3060 3060  /*
3061 3061   * Execute IBA JOIN.
3062 3062   */
3063 3063  static ibt_status_t
3064 3064  ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
3065 3065  {
3066 3066          ibt_mcg_attr_t mcg_attr;
3067 3067  
3068 3068          bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3069 3069          mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
3070 3070          mcg_attr.mc_mgid = mgid;
3071 3071          mcg_attr.mc_join_state = mce->mc_jstate;
3072 3072          mcg_attr.mc_scope = state->id_scope;
3073 3073          mcg_attr.mc_pkey = state->id_pkey;
3074 3074          mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
3075 3075          mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
3076 3076          mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
3077 3077          return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
3078 3078              NULL, NULL));
3079 3079  }
3080 3080  
3081 3081  /*
3082 3082   * This code JOINs the port in the proper way (depending on the join
3083 3083   * state) so that the IBA fabric will forward mcg packets to/from the port.
3084 3084   * It also attaches the QPN to the mcg so it can receive those mcg
3085 3085   * packets. This code makes sure not to attach the mcg to the QP if
3086 3086   * that has been previously done due to the mcg being joined with a
3087 3087   * different join state, even though this is not required by SWG_0216,
3088 3088   * refid 3610.
3089 3089   */
3090 3090  static ibd_mce_t *
3091 3091  ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3092 3092  {
3093 3093          ibt_status_t ibt_status;
3094 3094          ibd_mce_t *mce, *tmce, *omce = NULL;
3095 3095          boolean_t do_attach = B_TRUE;
3096 3096  
3097 3097          DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
3098 3098              jstate, mgid.gid_prefix, mgid.gid_guid);
3099 3099  
3100 3100          /*
3101 3101           * For enable_multicast Full member joins, we need to do some
3102 3102           * extra work. If there is already an mce on the list that
3103 3103           * indicates full membership, that means the membership has
3104 3104           * not yet been dropped (since the disable_multicast was issued)
3105 3105           * because there are pending Tx's to the mcg; in that case, just
3106 3106           * mark the mce not to be reaped when the Tx completion queues
3107 3107           * an async reap operation.
3108 3108           *
3109 3109           * If there is already an mce on the list indicating sendonly
3110 3110           * membership, try to promote to full membership. Be careful
3111 3111           * not to deallocate the old mce, since there might be an AH
3112 3112           * pointing to it; instead, update the old mce with new data
3113 3113           * that tracks the full membership.
3114 3114           */
3115 3115          if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
3116 3116              IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
3117 3117                  if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
3118 3118                          ASSERT(omce->mc_fullreap);
3119 3119                          omce->mc_fullreap = B_FALSE;
3120 3120                          return (omce);
3121 3121                  } else {
3122 3122                          ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3123 3123                  }
3124 3124          }
3125 3125  
3126 3126          /*
3127 3127           * Allocate the ibd_mce_t to track this JOIN.
3128 3128           */
3129 3129          mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
3130 3130          mce->mc_fullreap = B_FALSE;
3131 3131          mce->mc_jstate = jstate;
3132 3132  
3133 3133          if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
3134 3134                  DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
3135 3135                      ibt_status);
3136 3136                  kmem_free(mce, sizeof (ibd_mce_t));
3137 3137                  return (NULL);
3138 3138          }
3139 3139  
3140 3140          /*
3141 3141           * Is an IBA attach required? Not if the interface is already joined
3142 3142           * to the mcg in a different appropriate join state.
3143 3143           */
3144 3144          if (jstate == IB_MC_JSTATE_NON) {
3145 3145                  tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3146 3146                  if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3147 3147                          do_attach = B_FALSE;
3148 3148          } else if (jstate == IB_MC_JSTATE_FULL) {
3149 3149                  if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3150 3150                          do_attach = B_FALSE;
3151 3151          } else {        /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3152 3152                  do_attach = B_FALSE;
3153 3153          }
3154 3154  
3155 3155          if (do_attach) {
3156 3156                  /*
3157 3157                   * Do the IBA attach.
3158 3158                   */
3159 3159                  DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
3160 3160                  if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
3161 3161                      &mce->mc_info)) != IBT_SUCCESS) {
3162 3162                          DPRINT(10, "ibd_join_group : failed qp attachment "
3163 3163                              "%d\n", ibt_status);
3164 3164                          /*
3165 3165                           * NOTE that we should probably preserve the join info
3166 3166                           * in the list and later try to leave again at detach
3167 3167                           * time.
3168 3168                           */
3169 3169                          (void) ibt_leave_mcg(state->id_sgid, mgid,
3170 3170                              state->id_sgid, jstate);
3171 3171                          kmem_free(mce, sizeof (ibd_mce_t));
3172 3172                          return (NULL);
3173 3173                  }
3174 3174          }
3175 3175  
3176 3176          /*
3177 3177           * Insert the ibd_mce_t in the proper list.
3178 3178           */
3179 3179          if (jstate == IB_MC_JSTATE_NON) {
3180 3180                  IBD_MCACHE_INSERT_NON(state, mce);
3181 3181          } else {
3182 3182                  /*
3183 3183                   * Set up the mc_req fields used for reaping the
3184 3184                   * mcg in case of delayed tx completion (see
3185 3185                   * ibd_tx_cleanup()). Also done for sendonly join in
3186 3186                   * case we are promoted to fullmembership later and
3187 3187                   * keep using the same mce.
3188 3188                   */
3189 3189                  mce->mc_req.rq_gid = mgid;
3190 3190                  mce->mc_req.rq_ptr = mce;
3191 3191                  /*
3192 3192                   * Check whether this is the case of trying to join
3193 3193                   * full member, and we were already joined send only.
3194 3194                   * We try to drop our SendOnly membership, but it is
3195 3195                   * possible that the mcg does not exist anymore (and
3196 3196                   * the subnet trap never reached us), so the leave
3197 3197                   * operation might fail.
3198 3198                   */
3199 3199                  if (omce != NULL) {
3200 3200                          (void) ibt_leave_mcg(state->id_sgid, mgid,
3201 3201                              state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3202 3202                          omce->mc_jstate = IB_MC_JSTATE_FULL;
3203 3203                          bcopy(&mce->mc_info, &omce->mc_info,
3204 3204                              sizeof (ibt_mcg_info_t));
3205 3205                          kmem_free(mce, sizeof (ibd_mce_t));
3206 3206                          return (omce);
3207 3207                  }
3208 3208                  mutex_enter(&state->id_mc_mutex);
3209 3209                  IBD_MCACHE_INSERT_FULL(state, mce);
3210 3210                  mutex_exit(&state->id_mc_mutex);
3211 3211          }
3212 3212  
3213 3213          return (mce);
3214 3214  }
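           /*
            * Reviewer summary of the join states used above:
            * IB_MC_JSTATE_FULL is a regular member (sends and receives),
            * IB_MC_JSTATE_NON is a non-member join that still attaches the
            * QP so traffic can be received (used here for promiscuous
            * mode), and IB_MC_JSTATE_SEND_ONLY_NON transmits without ever
            * needing a QP attach.
            */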
3215 3215  
3216 3216  /*
3217 3217   * Called during port up event handling to attempt to reacquire full
3218 3218   * membership to an mcg. Stripped down version of ibd_join_group().
3219 3219   * Note that it is possible that the mcg might have gone away, in
3220 3220   * which case it gets recreated at this point.
3221 3221   */
3222 3222  static void
3223 3223  ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3224 3224  {
3225 3225          ib_gid_t mgid;
3226 3226  
3227 3227          /*
3228 3228           * If the mc_fullreap flag is set, or this join fails, a subsequent
3229 3229           * reap/leave is going to try to leave the group. We could prevent
3230 3230           * that by adding a boolean flag into ibd_mce_t, if required.
3231 3231           */
3232 3232          if (mce->mc_fullreap)
3233 3233                  return;
3234 3234  
3235 3235          mgid = mce->mc_info.mc_adds_vect.av_dgid;
3236 3236  
3237 3237          DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3238 3238              mgid.gid_guid);
3239 3239  
3240 3240          /* While reacquiring, leave and then join the MCG */
3241 3241          (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
3242 3242              mce->mc_jstate);
3243 3243          if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3244 3244                  ibd_print_warn(state, "Failure on port up to rejoin "
3245 3245                      "multicast gid %016llx:%016llx",
3246 3246                      (u_longlong_t)mgid.gid_prefix,
3247 3247                      (u_longlong_t)mgid.gid_guid);
3248 3248  }
3249 3249  
3250 3250  /*
3251 3251   * This code handles delayed Tx completion cleanups for mcg's to which
3252 3252   * disable_multicast has been issued, regular mcg related cleanups during
3253 3253   * disable_multicast, disable_promiscuous and mcg traps, as well as
3254 3254   * cleanups during driver detach time. Depending on the join state,
3255 3255   * it deletes the mce from the appropriate list and issues the IBA
3256 3256   * leave/detach; except in the disable_multicast case when the mce
3257 3257   * is left on the active list for a subsequent Tx completion cleanup.
3258 3258   */
3259 3259  static void
3260 3260  ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3261 3261      uint8_t jstate)
3262 3262  {
3263 3263          ibd_mce_t *tmce;
3264 3264          boolean_t do_detach = B_TRUE;
3265 3265  
3266 3266          /*
3267 3267           * Before detaching, we must check whether the other list
3268 3268           * contains the mcg; if we detach blindly, the consumer
3269 3269           * who set up the other list will also stop receiving
3270 3270           * traffic.
3271 3271           */
3272 3272          if (jstate == IB_MC_JSTATE_FULL) {
3273 3273                  /*
3274 3274                   * The following check is only relevant while coming
3275 3275                   * from the Tx completion path in the reap case.
3276 3276                   */
3277 3277                  if (!mce->mc_fullreap)
3278 3278                          return;
3279 3279                  mutex_enter(&state->id_mc_mutex);
3280 3280                  IBD_MCACHE_PULLOUT_FULL(state, mce);
3281 3281                  mutex_exit(&state->id_mc_mutex);
3282 3282                  if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3283 3283                          do_detach = B_FALSE;
3284 3284          } else if (jstate == IB_MC_JSTATE_NON) {
3285 3285                  IBD_MCACHE_PULLOUT_NON(state, mce);
3286 3286                  tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3287 3287                  if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3288 3288                          do_detach = B_FALSE;
3289 3289          } else {        /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3290 3290                  mutex_enter(&state->id_mc_mutex);
3291 3291                  IBD_MCACHE_PULLOUT_FULL(state, mce);
3292 3292                  mutex_exit(&state->id_mc_mutex);
3293 3293                  do_detach = B_FALSE;
3294 3294          }
3295 3295  
3296 3296          /*
3297 3297           * If we are reacting to an mcg trap and leaving our sendonly or
3298 3298           * non membership, the mcg is possibly already gone, so attempting
3299 3299           * to leave might fail. On the other hand, we must try to leave
3300 3300           * anyway, since this might be a trap from long ago, and we could
3301 3301           * have potentially sendonly joined to a recent incarnation of
3302 3302           * the mcg and are about to lose track of this information.
3303 3303           */
3304 3304          if (do_detach) {
3305 3305                  DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3306 3306                      "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3307 3307                  (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3308 3308          }
3309 3309  
3310 3310          (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3311 3311          kmem_free(mce, sizeof (ibd_mce_t));
3312 3312  }
3313 3313  
3314 3314  /*
3315 3315   * Async code executed due to multicast and promiscuous disable requests
3316 3316   * and mcg trap handling; also executed during driver detach. Mostly, a
3317 3317   * leave and detach is done; except for the fullmember case when Tx
3318 3318   * requests are pending, whence arrangements are made for subsequent
3319 3319   * cleanup on Tx completion.
3320 3320   */
3321 3321  static void
3322 3322  ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3323 3323  {
3324 3324          ipoib_mac_t mcmac;
3325 3325          boolean_t recycled;
3326 3326          ibd_mce_t *mce;
3327 3327  
3328 3328          DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3329 3329              jstate, mgid.gid_prefix, mgid.gid_guid);
3330 3330  
3331 3331          if (jstate == IB_MC_JSTATE_NON) {
3332 3332                  recycled = B_TRUE;
3333 3333                  mce = IBD_MCACHE_FIND_NON(state, mgid);
3334 3334                  /*
3335 3335                   * In case we are handling a mcg trap, we might not find
3336 3336                   * In case we are handling an mcg trap, we might not find
3337 3337                   */
3338 3338                  if (mce == NULL) {
3339 3339                          return;
3340 3340                  }
3341 3341          } else {
3342 3342                  mce = IBD_MCACHE_FIND_FULL(state, mgid);
3343 3343  
3344 3344                  /*
3345 3345                   * In case we are handling an mcg trap, make sure the trap
3346 3346                   * has not arrived late; if we have an mce that indicates
3347 3347                   * that we are already a fullmember, that would be a clear
3348 3348                   * indication that the trap arrived late (i.e., is for a
3349 3349                   * previous incarnation of the mcg).
3350 3350                   */
3351 3351                  if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3352 3352                          if ((mce == NULL) || (mce->mc_jstate ==
3353 3353                              IB_MC_JSTATE_FULL)) {
3354 3354                                  return;
3355 3355                          }
3356 3356                  } else {
3357 3357                          ASSERT(jstate == IB_MC_JSTATE_FULL);
3358 3358  
3359 3359                          /*
3360 3360                           * If join group failed, mce will be NULL here.
3361 3361                           * This is because in a GLDv3 driver, set multicast
3362 3362                           * will always return success.
3363 3363                           */
3364 3364                          if (mce == NULL) {
3365 3365                                  return;
3366 3366                          }
3367 3367  
3368 3368                          mce->mc_fullreap = B_TRUE;
3369 3369                  }
3370 3370  
3371 3371                  /*
3372 3372                   * If no pending Tx's remain that reference the AH
3373 3373                   * for the mcg, recycle it from active to free list.
3374 3374                   * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3375 3375                   * so the last completing Tx will cause an async reap
3376 3376                   * operation to be invoked, at which time we will drop our
3377 3377                   * membership to the mcg so that the pending Tx's complete
3378 3378                   * successfully. Refer to comments on "AH and MCE active
3379 3379                   * list manipulation" at top of this file. The lock protects
3380 3380                   * against Tx fast path and Tx cleanup code.
3381 3381                   */
3382 3382                  mutex_enter(&state->id_ac_mutex);
3383 3383                  ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3384 3384                  recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3385 3385                      IB_MC_JSTATE_SEND_ONLY_NON));
3386 3386                  mutex_exit(&state->id_ac_mutex);
3387 3387          }
3388 3388  
3389 3389          if (recycled) {
3390 3390                  DPRINT(2, "ibd_leave_group : leave_group reaping : "
3391 3391                      "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3392 3392                  ibd_async_reap_group(state, mce, mgid, jstate);
3393 3393          }
3394 3394  }
3395 3395  
3396 3396  /*
3397 3397   * Find the broadcast address as defined by IPoIB; implicitly
3398 3398   * determines the IBA scope, mtu, tclass etc of the link the
3399 3399   * interface is going to be a member of.
3400 3400   */
3401 3401  static ibt_status_t
3402 3402  ibd_find_bgroup(ibd_state_t *state)
3403 3403  {
3404 3404          ibt_mcg_attr_t mcg_attr;
3405 3405          uint_t numg;
3406 3406          uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3407 3407              IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3408 3408              IB_MC_SCOPE_GLOBAL };
3409 3409          int i, mcgmtu;
3410 3410          boolean_t found = B_FALSE;
3411 3411          int ret;
3412 3412          ibt_mcg_info_t mcg_info;
3413 3413  
3414 3414          state->id_bgroup_created = B_FALSE;
3415 3415          state->id_bgroup_present = B_FALSE;
3416 3416  
3417 3417  query_bcast_grp:
3418 3418          bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3419 3419          mcg_attr.mc_pkey = state->id_pkey;
3420 3420          _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3421 3421          state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3422 3422          _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3423 3423  
3424 3424          for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3425 3425                  state->id_scope = mcg_attr.mc_scope = scopes[i];
3426 3426  
3427 3427                  /*
3428 3428                   * Look for the IPoIB broadcast group.
3429 3429                   */
3430 3430                  _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3431 3431                  state->id_mgid.gid_prefix =
3432 3432                      (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3433 3433                      ((uint64_t)state->id_scope << 48) |
3434 3434                      ((uint32_t)(state->id_pkey << 16)));
3435 3435                  mcg_attr.mc_mgid = state->id_mgid;
3436 3436                  _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3437 3437                  if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3438 3438                      &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3439 3439                          found = B_TRUE;
3440 3440                          break;
3441 3441                  }
3442 3442          }
3443 3443  
3444 3444          if (!found) {
3445 3445                  if (state->id_create_broadcast_group) {
3446 3446                          /*
3447 3447                           * If we created the broadcast group, but failed to
3448 3448                           * find it, we can't do anything except leave the
3449 3449                           * one we created and return failure.
3450 3450                           */
3451 3451                          if (state->id_bgroup_created) {
3452 3452                                  ibd_print_warn(state, "IPoIB broadcast group "
3453 3453                                      "absent. Unable to query after create.");
3454 3454                                  goto find_bgroup_fail;
3455 3455                          }
3456 3456  
3457 3457                          /*
3458 3458                           * Create the IPoIB broadcast group if it doesn't exist.
3459 3459                           */
3460 3460                          bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3461 3461                          mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3462 3462                          mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3463 3463                          mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3464 3464                          mcg_attr.mc_pkey = state->id_pkey;
3465 3465                          mcg_attr.mc_flow = 0;
3466 3466                          mcg_attr.mc_sl = 0;
3467 3467                          mcg_attr.mc_tclass = 0;
3468 3468                          _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3469 3469                          state->id_mgid.gid_prefix =
3470 3470                              (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3471 3471                              ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3472 3472                              ((uint32_t)(state->id_pkey << 16)));
3473 3473                          mcg_attr.mc_mgid = state->id_mgid;
3474 3474                          _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3475 3475  
3476 3476                          if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3477 3477                              &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3478 3478                                  ibd_print_warn(state, "IPoIB broadcast group "
3479 3479                                      "absent, create failed: ret = %d\n", ret);
3480 3480                                  state->id_bgroup_created = B_FALSE;
3481 3481                                  return (IBT_FAILURE);
3482 3482                          }
3483 3483                          state->id_bgroup_created = B_TRUE;
3484 3484                          goto query_bcast_grp;
3485 3485                  } else {
3486 3486                          ibd_print_warn(state, "IPoIB broadcast group absent");
3487 3487                          return (IBT_FAILURE);
3488 3488                  }
3489 3489          }
3490 3490  
3491 3491          /*
3492 3492           * Verify that the mcg mtu <= id_mtu; fill in the updated id_mtu.
3493 3493           */
3494 3494          mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3495 3495          if (state->id_mtu < mcgmtu) {
3496 3496                  ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3497 3497                      "greater than port's maximum MTU %d", mcgmtu,
3498 3498                      state->id_mtu);
3499 3499                  ibt_free_mcg_info(state->id_mcinfo, 1);
3500 3500                  goto find_bgroup_fail;
3501 3501          }
3502 3502          state->id_mtu = mcgmtu;
3503 3503          state->id_bgroup_present = B_TRUE;
3504 3504  
3505 3505          return (IBT_SUCCESS);
3506 3506  
3507 3507  find_bgroup_fail:
3508 3508          if (state->id_bgroup_created) {
3509 3509                  (void) ibt_leave_mcg(state->id_sgid,
3510 3510                      mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3511 3511                      IB_MC_JSTATE_FULL);
3512 3512          }
3513 3513  
3514 3514          return (IBT_FAILURE);
3515 3515  }
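           /*
            * Worked example (reviewer note): for the default pkey 0xffff at
            * subnet-local scope (0x2), the prefix computed above is
            * 0xff12401bffff0000 and the guid is 0x00000000ffffffff, i.e.
            * the well-known IPoIB broadcast group ff12:401b:ffff::ffff:ffff.
            * Likewise, mcgmtu = 128 << mc_mtu maps the IB MTU code to bytes,
            * e.g. code 4 -> 2048.
            */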
3516 3516  
3517 3517  static int
3518 3518  ibd_alloc_tx_copybufs(ibd_state_t *state)
3519 3519  {
3520 3520          ibt_mr_attr_t mem_attr;
3521 3521  
3522 3522          /*
3523 3523           * Allocate one big chunk for all regular tx copy bufs
3524 3524           */
3525 3525          state->id_tx_buf_sz = state->id_mtu;
3526 3526          if (state->id_lso_policy && state->id_lso_capable &&
3527 3527              (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3528 3528                  state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3529 3529          }
3530 3530  
3531 3531          state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe *
3532 3532              state->id_tx_buf_sz, KM_SLEEP);
3533 3533  
3534 3534          state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe *
3535 3535              sizeof (ibd_swqe_t), KM_SLEEP);
3536 3536  
3537 3537          /*
3538 3538           * Do one memory registration on the entire txbuf area
3539 3539           */
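        /*
         * A single registration yields one lkey that covers every copy
         * buffer, so ibd_init_txlist() can stamp the same md_lkey into
         * each swqe SGL rather than registering id_ud_num_swqe separate
         * regions.
         */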
3540 3540          mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3541 3541          mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz;
3542 3542          mem_attr.mr_as = NULL;
3543 3543          mem_attr.mr_flags = IBT_MR_SLEEP;
3544 3544          if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3545 3545              &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3546 3546                  DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3547 3547                  kmem_free(state->id_tx_wqes,
3548 3548                      state->id_ud_num_swqe * sizeof (ibd_swqe_t));
3549 3549                  kmem_free(state->id_tx_bufs,
3550 3550                      state->id_ud_num_swqe * state->id_tx_buf_sz);
3551 3551                  state->id_tx_bufs = NULL;
3552 3552                  return (DDI_FAILURE);
3553 3553          }
3554 3554  
3555 3555          return (DDI_SUCCESS);
3556 3556  }
3557 3557  
3558 3558  static int
3559 3559  ibd_alloc_tx_lsobufs(ibd_state_t *state)
3560 3560  {
3561 3561          ibt_mr_attr_t mem_attr;
3562 3562          ibd_lsobuf_t *buflist;
3563 3563          ibd_lsobuf_t *lbufp;
3564 3564          ibd_lsobuf_t *tail;
3565 3565          ibd_lsobkt_t *bktp;
3566 3566          uint8_t *membase;
3567 3567          uint8_t *memp;
3568 3568          uint_t memsz;
3569 3569          int i;
3570 3570  
3571 3571          /*
3572 3572           * Allocate the lso bucket
3573 3573           */
3574 3574          bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3575 3575  
3576 3576          /*
3577 3577           * Allocate the entire lso memory and register it
3578 3578           */
3579 3579          memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ;
3580 3580          membase = kmem_zalloc(memsz, KM_SLEEP);
3581 3581  
3582 3582          mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3583 3583          mem_attr.mr_len = memsz;
3584 3584          mem_attr.mr_as = NULL;
3585 3585          mem_attr.mr_flags = IBT_MR_SLEEP;
3586 3586          if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3587 3587              &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3588 3588                  DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3589 3589                  kmem_free(membase, memsz);
3590 3590                  kmem_free(bktp, sizeof (ibd_lsobkt_t));
3591 3591                  return (DDI_FAILURE);
3592 3592          }
3593 3593  
3594 3594          mutex_enter(&state->id_lso_lock);
3595 3595  
3596 3596          /*
3597 3597           * Now allocate the buflist.  Note that the elements in the buflist and
3598 3598           * the buffers in the lso memory have a permanent 1-1 relation, so we
3599 3599           * can always derive the address of a buflist entry from the address of
3600 3600           * an lso buffer.
3601 3601           */
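        /*
         * Concretely: buffer i lives at membase + i * IBD_LSO_BUFSZ, so
         * ibd_release_lsobufs() recovers i from a buffer address va as
         * (va - membase) / IBD_LSO_BUFSZ and indexes straight back into
         * this buflist.
         */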
3602 3602          buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t),
3603 3603              KM_SLEEP);
3604 3604  
3605 3605          /*
3606 3606           * Set up the lso buf chain
3607 3607           */
3608 3608          memp = membase;
3609 3609          lbufp = buflist;
3610 3610          for (i = 0; i < state->id_num_lso_bufs; i++) {
3611 3611                  lbufp->lb_isfree = 1;
3612 3612                  lbufp->lb_buf = memp;
3613 3613                  lbufp->lb_next = lbufp + 1;
3614 3614  
3615 3615                  tail = lbufp;
3616 3616  
3617 3617                  memp += IBD_LSO_BUFSZ;
3618 3618                  lbufp++;
3619 3619          }
3620 3620          tail->lb_next = NULL;
3621 3621  
3622 3622          /*
3623 3623           * Set up the LSO buffer information in ibd state
3624 3624           */
3625 3625          bktp->bkt_bufl = buflist;
3626 3626          bktp->bkt_free_head = buflist;
3627 3627          bktp->bkt_mem = membase;
3628 3628          bktp->bkt_nelem = state->id_num_lso_bufs;
3629 3629          bktp->bkt_nfree = bktp->bkt_nelem;
3630 3630  
3631 3631          state->id_lso = bktp;
3632 3632          mutex_exit(&state->id_lso_lock);
3633 3633  
3634 3634          return (DDI_SUCCESS);
3635 3635  }
3636 3636  
3637 3637  /*
3638 3638   * Statically allocate Tx buffer list(s).
3639 3639   */
3640 3640  static int
3641 3641  ibd_init_txlist(ibd_state_t *state)
3642 3642  {
3643 3643          ibd_swqe_t *swqe;
3644 3644          ibt_lkey_t lkey;
3645 3645          int i;
3646 3646          uint_t len;
3647 3647          uint8_t *bufaddr;
3648 3648  
3649 3649          if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3650 3650                  return (DDI_FAILURE);
3651 3651  
3652 3652          if (state->id_lso_policy && state->id_lso_capable) {
3653 3653                  if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3654 3654                          state->id_lso_capable = B_FALSE;
3655 3655          }
3656 3656  
3657 3657          mutex_enter(&state->id_tx_list.dl_mutex);
3658 3658          state->id_tx_list.dl_head = NULL;
3659 3659          state->id_tx_list.dl_pending_sends = B_FALSE;
3660 3660          state->id_tx_list.dl_cnt = 0;
3661 3661          mutex_exit(&state->id_tx_list.dl_mutex);
3662 3662          mutex_enter(&state->id_tx_rel_list.dl_mutex);
3663 3663          state->id_tx_rel_list.dl_head = NULL;
3664 3664          state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3665 3665          state->id_tx_rel_list.dl_cnt = 0;
3666 3666          mutex_exit(&state->id_tx_rel_list.dl_mutex);
3667 3667  
3668 3668          /*
3669 3669           * Allocate and setup the swqe list
3670 3670           */
3671 3671          lkey = state->id_tx_mr_desc.md_lkey;
3672 3672          bufaddr = state->id_tx_bufs;
3673 3673          len = state->id_tx_buf_sz;
3674 3674          swqe = state->id_tx_wqes;
3675 3675          mutex_enter(&state->id_tx_list.dl_mutex);
3676 3676          for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) {
3677 3677                  swqe->swqe_next = NULL;
3678 3678                  swqe->swqe_im_mblk = NULL;
3679 3679  
3680 3680                  swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3681 3681                      bufaddr;
3682 3682                  swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3683 3683                  swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3684 3684  
3685 3685                  swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3686 3686                  swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3687 3687                  swqe->w_swr.wr_trans = IBT_UD_SRV;
3688 3688  
3689 3689                  /* These are set in send */
3690 3690                  swqe->w_swr.wr_nds = 0;
3691 3691                  swqe->w_swr.wr_sgl = NULL;
3692 3692                  swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3693 3693  
3694 3694                  /* add to list */
3695 3695                  state->id_tx_list.dl_cnt++;
3696 3696                  swqe->swqe_next = state->id_tx_list.dl_head;
3697 3697                  state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3698 3698          }
3699 3699          mutex_exit(&state->id_tx_list.dl_mutex);
3700 3700  
3701 3701          return (DDI_SUCCESS);
3702 3702  }
3703 3703  
3704 3704  static int
3705 3705  ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3706 3706      uint32_t *nds_p)
3707 3707  {
3708 3708          ibd_lsobkt_t *bktp;
3709 3709          ibd_lsobuf_t *lbufp;
3710 3710          ibd_lsobuf_t *nextp;
3711 3711          ibt_lkey_t lso_lkey;
3712 3712          uint_t frag_sz;
3713 3713          uint_t num_needed;
3714 3714          int i;
3715 3715  
3716 3716          ASSERT(sgl_p != NULL);
3717 3717          ASSERT(nds_p != NULL);
3718 3718          ASSERT(req_sz != 0);
3719 3719  
3720 3720          /*
3721 3721           * Determine how many bufs we'd need for the size requested
3722 3722           */
3723 3723          num_needed = req_sz / IBD_LSO_BUFSZ;
3724 3724          if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3725 3725                  num_needed++;
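        /*
         * For example, a req_sz of (2 * IBD_LSO_BUFSZ + 100) needs three
         * buffers, leaving frag_sz == 100; the last sgl entry's length
         * is trimmed to that remainder further down.
         */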
3726 3726  
3727 3727          mutex_enter(&state->id_lso_lock);
3728 3728  
3729 3729          /*
3730 3730           * If we don't have enough lso bufs, return failure
3731 3731           */
3732 3732          ASSERT(state->id_lso != NULL);
3733 3733          bktp = state->id_lso;
3734 3734          if (bktp->bkt_nfree < num_needed) {
3735 3735                  mutex_exit(&state->id_lso_lock);
3736 3736                  return (-1);
3737 3737          }
3738 3738  
3739 3739          /*
3740 3740           * Pick the first 'num_needed' bufs from the free list
3741 3741           */
3742 3742          lso_lkey = bktp->bkt_mr_desc.md_lkey;
3743 3743          lbufp = bktp->bkt_free_head;
3744 3744          for (i = 0; i < num_needed; i++) {
3745 3745                  ASSERT(lbufp->lb_isfree != 0);
3746 3746                  ASSERT(lbufp->lb_buf != NULL);
3747 3747  
3748 3748                  nextp = lbufp->lb_next;
3749 3749  
3750 3750                  sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3751 3751                  sgl_p[i].ds_key = lso_lkey;
3752 3752                  sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3753 3753  
3754 3754                  lbufp->lb_isfree = 0;
3755 3755                  lbufp->lb_next = NULL;
3756 3756  
3757 3757                  lbufp = nextp;
3758 3758          }
3759 3759          bktp->bkt_free_head = lbufp;
3760 3760  
3761 3761          /*
3762 3762           * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3763 3763           * to adjust the last sgl entry's length. Since we know we need at least
3764 3764           * one, the i-1 use below is ok.
3765 3765           */
3766 3766          if (frag_sz) {
3767 3767                  sgl_p[i-1].ds_len = frag_sz;
3768 3768          }
3769 3769  
3770 3770          /*
3771 3771           * Update nfree count and return
3772 3772           */
3773 3773          bktp->bkt_nfree -= num_needed;
3774 3774  
3775 3775          mutex_exit(&state->id_lso_lock);
3776 3776  
3777 3777          *nds_p = num_needed;
3778 3778  
3779 3779          return (0);
3780 3780  }
3781 3781  
3782 3782  static void
3783 3783  ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3784 3784  {
3785 3785          ibd_lsobkt_t *bktp;
3786 3786          ibd_lsobuf_t *lbufp;
3787 3787          uint8_t *lso_mem_end;
3788 3788          uint_t ndx;
3789 3789          int i;
3790 3790  
3791 3791          mutex_enter(&state->id_lso_lock);
3792 3792  
3793 3793          bktp = state->id_lso;
3794 3794          ASSERT(bktp != NULL);
3795 3795  
3796 3796          lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3797 3797          for (i = 0; i < nds; i++) {
3798 3798                  uint8_t *va;
3799 3799  
3800 3800                  va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3801 3801                  ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3802 3802  
3803 3803                  /*
3804 3804                   * Figure out the buflist element this sgl buffer corresponds
3805 3805                   * to and put it back at the head
3806 3806                   */
3807 3807                  ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3808 3808                  lbufp = bktp->bkt_bufl + ndx;
3809 3809  
3810 3810                  ASSERT(lbufp->lb_isfree == 0);
3811 3811                  ASSERT(lbufp->lb_buf == va);
3812 3812  
3813 3813                  lbufp->lb_isfree = 1;
3814 3814                  lbufp->lb_next = bktp->bkt_free_head;
3815 3815                  bktp->bkt_free_head = lbufp;
3816 3816          }
3817 3817          bktp->bkt_nfree += nds;
3818 3818  
3819 3819          mutex_exit(&state->id_lso_lock);
3820 3820  }
3821 3821  
3822 3822  static void
3823 3823  ibd_free_tx_copybufs(ibd_state_t *state)
3824 3824  {
3825 3825          /*
3826 3826           * Unregister txbuf mr
3827 3827           */
3828 3828          if (ibt_deregister_mr(state->id_hca_hdl,
3829 3829              state->id_tx_mr_hdl) != IBT_SUCCESS) {
3830 3830                  DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3831 3831          }
3832 3832          state->id_tx_mr_hdl = NULL;
3833 3833  
3834 3834          /*
3835 3835           * Free txbuf memory
3836 3836           */
3837 3837          kmem_free(state->id_tx_wqes, state->id_ud_num_swqe *
3838 3838              sizeof (ibd_swqe_t));
3839 3839          kmem_free(state->id_tx_bufs, state->id_ud_num_swqe *
3840 3840              state->id_tx_buf_sz);
3841 3841          state->id_tx_wqes = NULL;
3842 3842          state->id_tx_bufs = NULL;
3843 3843  }
3844 3844  
3845 3845  static void
3846 3846  ibd_free_tx_lsobufs(ibd_state_t *state)
3847 3847  {
3848 3848          ibd_lsobkt_t *bktp;
3849 3849  
3850 3850          mutex_enter(&state->id_lso_lock);
3851 3851  
3852 3852          if ((bktp = state->id_lso) == NULL) {
3853 3853                  mutex_exit(&state->id_lso_lock);
3854 3854                  return;
3855 3855          }
3856 3856  
3857 3857          /*
3858 3858           * First, free the buflist
3859 3859           */
3860 3860          ASSERT(bktp->bkt_bufl != NULL);
3861 3861          kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3862 3862  
3863 3863          /*
3864 3864           * Unregister the LSO memory and free it
3865 3865           */
3866 3866          ASSERT(bktp->bkt_mr_hdl != NULL);
3867 3867          if (ibt_deregister_mr(state->id_hca_hdl,
3868 3868              bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3869 3869                  DPRINT(10,
3870 3870                      "ibd_free_lsobufs: ibt_deregister_mr failed");
3871 3871          }
3872 3872          ASSERT(bktp->bkt_mem);
3873 3873          kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3874 3874  
3875 3875          /*
3876 3876           * Finally free the bucket
3877 3877           */
3878 3878          kmem_free(bktp, sizeof (ibd_lsobkt_t));
3879 3879          state->id_lso = NULL;
3880 3880  
3881 3881          mutex_exit(&state->id_lso_lock);
3882 3882  }
3883 3883  
3884 3884  /*
3885 3885   * Free the statically allocated Tx buffer list.
3886 3886   */
3887 3887  static void
3888 3888  ibd_fini_txlist(ibd_state_t *state)
3889 3889  {
3890 3890          /*
3891 3891           * Free the allocated swqes
3892 3892           */
3893 3893          mutex_enter(&state->id_tx_list.dl_mutex);
3894 3894          mutex_enter(&state->id_tx_rel_list.dl_mutex);
3895 3895          state->id_tx_list.dl_head = NULL;
3896 3896          state->id_tx_list.dl_pending_sends = B_FALSE;
3897 3897          state->id_tx_list.dl_cnt = 0;
3898 3898          state->id_tx_rel_list.dl_head = NULL;
3899 3899          state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3900 3900          state->id_tx_rel_list.dl_cnt = 0;
3901 3901          mutex_exit(&state->id_tx_rel_list.dl_mutex);
3902 3902          mutex_exit(&state->id_tx_list.dl_mutex);
3903 3903  
3904 3904          ibd_free_tx_lsobufs(state);
3905 3905          ibd_free_tx_copybufs(state);
3906 3906  }
3907 3907  
3908 3908  /*
3909 3909   * Post a NULL-terminated list of rwqes.
3910 3910   */
3911 3911  static void
3912 3912  ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3913 3913  {
3914 3914          uint_t          i;
3915 3915          uint_t          num_posted;
3916 3916          ibt_status_t    ibt_status;
3917 3917          ibt_recv_wr_t   wrs[IBD_RX_POST_CNT];
3918 3918  
3919 3919          while (rwqe) {
3920 3920                  /* Post up to IBD_RX_POST_CNT receive work requests */
3921 3921                  for (i = 0; i < IBD_RX_POST_CNT; i++) {
3922 3922                          wrs[i] = rwqe->w_rwr;
3923 3923                          rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3924 3924                          if (rwqe == NULL) {
3925 3925                                  i++;
3926 3926                                  break;
3927 3927                          }
3928 3928                  }
3929 3929  
3930 3930                  /*
3931 3931                   * If posting fails for some reason, we'll never receive
3932 3932                   * completion intimation, so we'll need to cleanup. But
3933 3933                   * we need to make sure we don't clean up nodes whose
3934 3934                   * wrs have been successfully posted. We assume that the
3935 3935                   * hca driver returns on the first failure to post and
3936 3936                   * therefore the first 'num_posted' entries don't need
3937 3937                   * cleanup here.
3938 3938                   */
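                /*
                 * Note that dl_cnt is bumped by i optimistically before
                 * the post; if the post comes up short, the addition of
                 * (num_posted - i) in the error path backs the count
                 * down by the difference.
                 */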
3939 3939                  atomic_add_32(&state->id_rx_list.dl_cnt, i);
3940 3940  
3941 3941                  num_posted = 0;
3942 3942                  ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3943 3943                      &num_posted);
3944 3944                  if (ibt_status != IBT_SUCCESS) {
3945 3945                          /* This cannot happen unless the device has an error. */
3946 3946                          ibd_print_warn(state, "ibd_post_recv: FATAL: "
3947 3947                              "posting multiple wrs failed: "
3948 3948                              "requested=%d, done=%d, ret=%d",
3949 3949                              IBD_RX_POST_CNT, num_posted, ibt_status);
3950 3950                          atomic_add_32(&state->id_rx_list.dl_cnt,
3951 3951                              num_posted - i);
3952 3952                  }
3953 3953          }
3954 3954  }
3955 3955  
3956 3956  /*
3957 3957   * Grab a list of rwqes from the array of lists, and post the list.
3958 3958   */
3959 3959  static void
3960 3960  ibd_post_recv_intr(ibd_state_t *state)
3961 3961  {
3962 3962          ibd_rx_queue_t  *rxp;
3963 3963          ibd_rwqe_t *list;
3964 3964  
3965 3965          /* rotate through the rx_queue array, expecting adequate queued rwqes */
3966 3966          state->id_rx_post_queue_index =
3967 3967              (state->id_rx_post_queue_index + 1) &
3968 3968              (state->id_rx_nqueues - 1);
3969 3969  
3970 3970          rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3971 3971          mutex_enter(&rxp->rx_post_lock);
3972 3972          list = WQE_TO_RWQE(rxp->rx_head);
3973 3973          rxp->rx_head = NULL;
3974 3974          rxp->rx_cnt = 0;
3975 3975          mutex_exit(&rxp->rx_post_lock);
3976 3976          ibd_post_recv_list(state, list);
3977 3977  }
3978 3978  
3979 3979  /* macro explained below */
3980 3980  #define RX_QUEUE_HASH(rwqe) \
3981 3981          (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3982 3982  
3983 3983  /*
3984 3984   * Add an rwqe to one of the Rx lists.  If the list has grown large
3985 3985   * enough (at least IBD_RX_POST_CNT - 2 entries), post it to the hardware.
3986 3986   *
3987 3987   * Note: one of 2^N lists is chosen via a hash.  This is done
3988 3988   * because using one list is contentious.  If the first list is busy
3989 3989   * (mutex_tryenter fails), use a second list (just call mutex_enter).
3990 3990   *
3991 3991   * The shift of 8 in RX_QUEUE_HASH is an arbitrary choice that maps
3992 3992   * rwqes evenly across the 2^N queues.
3993 3993   */
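/*
 * A sketch of the arithmetic, assuming id_rx_nqueues == 8: an rwqe at
 * address 0x12345600 hashes to (0x12345600 >> 8) & 7 == 6.  The mask
 * only works because id_rx_nqueues is a power of two (it is allocated
 * as 1 << IBD_LOG_RX_POST in ibd_alloc_rx_copybufs()).
 */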
3994 3994  static void
3995 3995  ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3996 3996  {
3997 3997          ibd_rx_queue_t  *rxp;
3998 3998  
3999 3999          rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
4000 4000  
4001 4001          if (!mutex_tryenter(&rxp->rx_post_lock)) {
4002 4002                  /* Failed.  Try a different queue ("ptr + 16" ensures that). */
4003 4003                  rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
4004 4004                  mutex_enter(&rxp->rx_post_lock);
4005 4005          }
4006 4006          rwqe->rwqe_next = rxp->rx_head;
4007 4007          if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
4008 4008                  uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
4009 4009  
4010 4010                  /* only call ibt_post_recv() every Nth time through here */
4011 4011                  if ((active & (state->id_rx_nqueues - 1)) == 0) {
4012 4012                          rxp->rx_head = NULL;
4013 4013                          rxp->rx_cnt = 0;
4014 4014                          mutex_exit(&rxp->rx_post_lock);
4015 4015                          ibd_post_recv_list(state, rwqe);
4016 4016                          return;
4017 4017                  }
4018 4018          }
4019 4019          rxp->rx_head = RWQE_TO_WQE(rwqe);
4020 4020          mutex_exit(&rxp->rx_post_lock);
4021 4021  }
4022 4022  
4023 4023  static int
4024 4024  ibd_alloc_rx_copybufs(ibd_state_t *state)
4025 4025  {
4026 4026          ibt_mr_attr_t mem_attr;
4027 4027          int i;
4028 4028  
4029 4029          /*
4030 4030           * Allocate one big chunk for all regular rx copy bufs
4031 4031           */
4032 4032          state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
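        /*
         * IPOIB_GRH_SIZE reserves room ahead of the payload for the
         * 40-byte IB Global Route Header that UD receives scatter into
         * the head of the buffer (left blank when no GRH is present).
         */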
4033 4033  
4034 4034          state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe *
4035 4035              state->id_rx_buf_sz, KM_SLEEP);
4036 4036  
4037 4037          state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe *
4038 4038              sizeof (ibd_rwqe_t), KM_SLEEP);
4039 4039  
4040 4040          state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
4041 4041          state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
4042 4042              sizeof (ibd_rx_queue_t), KM_SLEEP);
4043 4043          for (i = 0; i < state->id_rx_nqueues; i++) {
4044 4044                  ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4045 4045                  mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
4046 4046          }
4047 4047  
4048 4048          /*
4049 4049           * Do one memory registration on the entire rxbuf area
4050 4050           */
4051 4051          mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
4052 4052          mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz;
4053 4053          mem_attr.mr_as = NULL;
4054 4054          mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4055 4055          if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
4056 4056              &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
4057 4057                  DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
4058 4058                  kmem_free(state->id_rx_wqes,
4059 4059                      state->id_ud_num_rwqe * sizeof (ibd_rwqe_t));
4060 4060                  kmem_free(state->id_rx_bufs,
4061 4061                      state->id_ud_num_rwqe * state->id_rx_buf_sz);
4062 4062                  state->id_rx_bufs = NULL;
4063 4063                  state->id_rx_wqes = NULL;
4064 4064                  return (DDI_FAILURE);
4065 4065          }
4066 4066  
4067 4067          return (DDI_SUCCESS);
4068 4068  }
4069 4069  
4070 4070  /*
4071 4071   * Set up the statically allocated Rx buffer list.
4072 4072   */
4073 4073  static int
4074 4074  ibd_init_rxlist(ibd_state_t *state)
4075 4075  {
4076 4076          ibd_rwqe_t *rwqe, *next;
4077 4077          ibd_wqe_t *list;
4078 4078          ibt_lkey_t lkey;
4079 4079          int i;
4080 4080          uint_t len;
4081 4081          uint8_t *bufaddr;
4082 4082  
4083 4083          mutex_enter(&state->id_rx_free_list.dl_mutex);
4084 4084          if (state->id_rx_free_list.dl_head != NULL) {
4085 4085                  /* rx rsrcs were never freed.  Just repost them */
4086 4086                  len = state->id_rx_buf_sz;
4087 4087                  list = state->id_rx_free_list.dl_head;
4088 4088                  state->id_rx_free_list.dl_head = NULL;
4089 4089                  state->id_rx_free_list.dl_cnt = 0;
4090 4090                  mutex_exit(&state->id_rx_free_list.dl_mutex);
4091 4091                  for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4092 4092                      rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4093 4093                          if ((rwqe->rwqe_im_mblk = desballoc(
4094 4094                              rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
4095 4095                              &rwqe->w_freemsg_cb)) == NULL) {
4096 4096                                  /* allow freemsg_cb to free the rwqes */
4097 4097                                  if (atomic_dec_32_nv(&state->id_running) != 0) {
4098 4098                                          cmn_err(CE_WARN, "ibd_init_rxlist: "
4099 4099                                              "id_running was not 1\n");
4100 4100                                  }
4101 4101                                  DPRINT(10, "ibd_init_rxlist : "
4102 4102                                      "failed in desballoc()");
4103 4103                                  for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4104 4104                                      rwqe = next) {
4105 4105                                          next = WQE_TO_RWQE(rwqe->rwqe_next);
4106 4106                                          if (rwqe->rwqe_im_mblk) {
4107 4107                                                  atomic_inc_32(&state->
4108 4108                                                      id_rx_list.
4109 4109                                                      dl_bufs_outstanding);
4110 4110                                                  freemsg(rwqe->rwqe_im_mblk);
4111 4111                                          } else
4112 4112                                                  ibd_free_rwqe(state, rwqe);
4113 4113                                  }
4114 4114                                  atomic_inc_32(&state->id_running);
4115 4115                                  return (DDI_FAILURE);
4116 4116                          }
4117 4117                  }
4118 4118                  ibd_post_recv_list(state, WQE_TO_RWQE(list));
4119 4119                  return (DDI_SUCCESS);
4120 4120          }
4121 4121          mutex_exit(&state->id_rx_free_list.dl_mutex);
4122 4122  
4123 4123          if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
4124 4124                  return (DDI_FAILURE);
4125 4125  
4126 4126          /*
4127 4127           * Allocate and setup the rwqe list
4128 4128           */
4129 4129          len = state->id_rx_buf_sz;
4130 4130          lkey = state->id_rx_mr_desc.md_lkey;
4131 4131          rwqe = state->id_rx_wqes;
4132 4132          bufaddr = state->id_rx_bufs;
4133 4133          list = NULL;
4134 4134          for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) {
4135 4135                  rwqe->w_state = state;
4136 4136                  rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
4137 4137                  rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
4138 4138  
4139 4139                  rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
4140 4140  
4141 4141                  if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
4142 4142                      &rwqe->w_freemsg_cb)) == NULL) {
4143 4143                          DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
4144 4144                          /* allow freemsg_cb to free the rwqes */
4145 4145                          if (atomic_dec_32_nv(&state->id_running) != 0) {
4146 4146                                  cmn_err(CE_WARN, "ibd_init_rxlist: "
4147 4147                                      "id_running was not 1\n");
4148 4148                          }
4151 4151                          for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4152 4152                              rwqe = next) {
4153 4153                                  next = WQE_TO_RWQE(rwqe->rwqe_next);
4154 4154                                  freemsg(rwqe->rwqe_im_mblk);
4155 4155                          }
4156 4156                          atomic_inc_32(&state->id_running);
4157 4157  
4158 4158                          /* remove reference to free'd rwqes */
4159 4159                          mutex_enter(&state->id_rx_free_list.dl_mutex);
4160 4160                          state->id_rx_free_list.dl_head = NULL;
4161 4161                          state->id_rx_free_list.dl_cnt = 0;
4162 4162                          mutex_exit(&state->id_rx_free_list.dl_mutex);
4163 4163  
4164 4164                          ibd_fini_rxlist(state);
4165 4165                          return (DDI_FAILURE);
4166 4166                  }
4167 4167  
4168 4168                  rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
4169 4169                  rwqe->rwqe_copybuf.ic_sgl.ds_va =
4170 4170                      (ib_vaddr_t)(uintptr_t)bufaddr;
4171 4171                  rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
4172 4172                  rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
4173 4173                  rwqe->w_rwr.wr_nds = 1;
4174 4174                  rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
4175 4175  
4176 4176                  rwqe->rwqe_next = list;
4177 4177                  list = RWQE_TO_WQE(rwqe);
4178 4178          }
4179 4179          ibd_post_recv_list(state, WQE_TO_RWQE(list));
4180 4180  
4181 4181          return (DDI_SUCCESS);
4182 4182  }
4183 4183  
4184 4184  static void
4185 4185  ibd_free_rx_copybufs(ibd_state_t *state)
4186 4186  {
4187 4187          int i;
4188 4188  
4189 4189          /*
4190 4190           * Unregister rxbuf mr
4191 4191           */
4192 4192          if (ibt_deregister_mr(state->id_hca_hdl,
4193 4193              state->id_rx_mr_hdl) != IBT_SUCCESS) {
4194 4194                  DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
4195 4195          }
4196 4196          state->id_rx_mr_hdl = NULL;
4197 4197  
4198 4198          /*
4199 4199           * Free rxbuf memory
4200 4200           */
4201 4201          for (i = 0; i < state->id_rx_nqueues; i++) {
4202 4202                  ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4203 4203                  mutex_destroy(&rxp->rx_post_lock);
4204 4204          }
4205 4205          kmem_free(state->id_rx_queues, state->id_rx_nqueues *
4206 4206              sizeof (ibd_rx_queue_t));
4207 4207          kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe *
4208 4208              sizeof (ibd_rwqe_t));
4209 4209          kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe *
4210 4210              state->id_rx_buf_sz);
4211 4211          state->id_rx_queues = NULL;
4212 4212          state->id_rx_wqes = NULL;
4213 4213          state->id_rx_bufs = NULL;
4214 4214  }
4215 4215  
4216 4216  static void
4217 4217  ibd_free_rx_rsrcs(ibd_state_t *state)
4218 4218  {
4219 4219          mutex_enter(&state->id_rx_free_list.dl_mutex);
4220 4220          if (state->id_rx_free_list.dl_head == NULL) {
4221 4221                  /* already freed */
4222 4222                  mutex_exit(&state->id_rx_free_list.dl_mutex);
4223 4223                  return;
4224 4224          }
4225 4225          ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe);
4226 4226          ibd_free_rx_copybufs(state);
4227 4227          state->id_rx_free_list.dl_cnt = 0;
4228 4228          state->id_rx_free_list.dl_head = NULL;
4229 4229          mutex_exit(&state->id_rx_free_list.dl_mutex);
4230 4230  }
4231 4231  
4232 4232  /*
4233 4233   * Free the statically allocated Rx buffer list.
4234 4234   */
4235 4235  static void
4236 4236  ibd_fini_rxlist(ibd_state_t *state)
4237 4237  {
4238 4238          ibd_rwqe_t *rwqe;
4239 4239          int i;
4240 4240  
4241 4241          /* run through the rx_queue's, calling freemsg() */
4242 4242          for (i = 0; i < state->id_rx_nqueues; i++) {
4243 4243                  ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4244 4244                  mutex_enter(&rxp->rx_post_lock);
4245 4245                  for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
4246 4246                      rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4247 4247                          freemsg(rwqe->rwqe_im_mblk);
4248 4248                          rxp->rx_cnt--;
4249 4249                  }
4250 4250                  rxp->rx_head = NULL;
4251 4251                  mutex_exit(&rxp->rx_post_lock);
4252 4252          }
4253 4253  
4254 4254          /* cannot free rx resources unless gld returned everything */
4255 4255          if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
4256 4256                  ibd_free_rx_rsrcs(state);
4257 4257  }
4258 4258  
4259 4259  /*
4260 4260   * Free an allocated recv wqe.
4261 4261   */
4262 4262  /* ARGSUSED */
4263 4263  static void
4264 4264  ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
4265 4265  {
4266 4266          /*
4267 4267           * desballoc() failed (no memory).
4268 4268           *
4269 4269           * This rwqe is placed on a free list so that it
4270 4270           * can be reinstated when memory is available.
4271 4271           *
4272 4272           * NOTE: no code currently exists to reinstate
4273 4273           * these "lost" rwqes.
4274 4274           */
4275 4275          mutex_enter(&state->id_rx_free_list.dl_mutex);
4276 4276          state->id_rx_free_list.dl_cnt++;
4277 4277          rwqe->rwqe_next = state->id_rx_free_list.dl_head;
4278 4278          state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
4279 4279          mutex_exit(&state->id_rx_free_list.dl_mutex);
4280 4280  }
4281 4281  
4282 4282  /*
4283 4283   * IBA Rx completion queue handler. Guaranteed to be single
4284 4284   * threaded and nonreentrant for this CQ.
4285 4285   */
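/*
 * A note on the poll-flag handshake below (the Tx CQ handler uses the
 * same pattern): if a softintr poll is already running (IBD_CQ_POLLING
 * set), the handler only sets IBD_REDO_CQ_POLLING so that the running
 * poll loop can notice it and make another pass; otherwise it triggers
 * a fresh softintr.
 */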
4286 4286  /* ARGSUSED */
4287 4287  static void
4288 4288  ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4289 4289  {
4290 4290          ibd_state_t *state = (ibd_state_t *)arg;
4291 4291  
4292 4292          atomic_inc_64(&state->id_num_intrs);
4293 4293  
4294 4294          if (ibd_rx_softintr == 1) {
4295 4295                  mutex_enter(&state->id_rcq_poll_lock);
4296 4296                  if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4297 4297                          state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
4298 4298                          mutex_exit(&state->id_rcq_poll_lock);
4299 4299                          return;
4300 4300                  } else {
4301 4301                          mutex_exit(&state->id_rcq_poll_lock);
4302 4302                          ddi_trigger_softintr(state->id_rx);
4303 4303                  }
4304 4304          } else
4305 4305                  (void) ibd_intr((caddr_t)state);
4306 4306  }
4307 4307  
4308 4308  /*
4309 4309   * CQ handler for Tx completions, when the Tx CQ is in
4310 4310   * interrupt driven mode.
4311 4311   */
4312 4312  /* ARGSUSED */
4313 4313  static void
4314 4314  ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4315 4315  {
4316 4316          ibd_state_t *state = (ibd_state_t *)arg;
4317 4317  
4318 4318          atomic_inc_64(&state->id_num_intrs);
4319 4319  
4320 4320          if (ibd_tx_softintr == 1) {
4321 4321                  mutex_enter(&state->id_scq_poll_lock);
4322 4322                  if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4323 4323                          state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4324 4324                          mutex_exit(&state->id_scq_poll_lock);
4325 4325                          return;
4326 4326                  } else {
4327 4327                          mutex_exit(&state->id_scq_poll_lock);
4328 4328                          ddi_trigger_softintr(state->id_tx);
4329 4329                  }
4330 4330          } else
4331 4331                  (void) ibd_tx_recycle((caddr_t)state);
4332 4332  }
4333 4333  
4334 4334  /*
4335 4335   * Multicast group create/delete trap handler. These will be delivered
4336 4336   * on a kernel thread (handling can thus block) and can be invoked
4337 4337   * concurrently. The handler can be invoked anytime after it is
4338 4338   * registered and before ibt_detach().
4339 4339   */
4340 4340  /* ARGSUSED */
4341 4341  static void
4342 4342  ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4343 4343      ibt_subnet_event_t *event)
4344 4344  {
4345 4345          ibd_state_t *state = (ibd_state_t *)arg;
4346 4346          ibd_req_t *req;
4347 4347  
4348 4348          /*
4349 4349           * The trap handler will get invoked once for every event for
4350 4350           * every port. The input "gid" is the GID0 of the port the
4351 4351           * trap came in on; we just need to act on traps that came
4352 4352           * to our port, meaning the port on which the ipoib interface
4353 4353           * resides. Since ipoib uses GID0 of the port, we just match
4354 4354           * the gids to check whether we need to handle the trap.
4355 4355           */
4356 4356          _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4357 4357          if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4358 4358                  return;
4359 4359          _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4360 4360  
4361 4361          DPRINT(10, "ibd_notices_handler : %d\n", code);
4362 4362  
4363 4363          switch (code) {
4364 4364                  case IBT_SM_EVENT_UNAVAILABLE:
4365 4365                          /*
4366 4366                           * If we are in promiscuous mode or have
4367 4367                           * sendnonmembers, we need to print a warning
4368 4368                           * message right now. Else, just store the
4369 4369                           * information, print when we enter promiscuous
4370 4370                           * mode or attempt nonmember send. We might
4371 4371                           * also want to stop caching sendnonmember.
4372 4372                           */
4373 4373                          ibd_print_warn(state, "IBA multicast support "
4374 4374                              "degraded due to unavailability of multicast "
4375 4375                              "traps");
4376 4376                          break;
4377 4377                  case IBT_SM_EVENT_AVAILABLE:
4378 4378                          /*
4379 4379                           * If we printed a warning message above or
4380 4380                           * while trying to nonmember send or get into
4381 4381                           * promiscuous mode, print an okay message.
4382 4382                           */
4383 4383                          ibd_print_warn(state, "IBA multicast support "
4384 4384                              "restored due to availability of multicast "
4385 4385                              "traps");
4386 4386                          break;
4387 4387                  case IBT_SM_EVENT_MCG_CREATED:
4388 4388                  case IBT_SM_EVENT_MCG_DELETED:
4389 4389                          /*
4390 4390                           * If it is a "deleted" event and we are in late hca
4391 4391                           * init, nothing to do.
4392 4392                           */
4393 4393                          if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4394 4394                              IBD_DRV_IN_LATE_HCA_INIT) && (code ==
4395 4395                              IBT_SM_EVENT_MCG_DELETED)) {
4396 4396                                  break;
4397 4397                          }
4398 4398                          /*
4399 4399                           * Common processing of creation/deletion traps.
4400 4400                           * First check if the instance is being
4401 4401                           * [de]initialized; back off then, without doing
4402 4402                           * anything more, since we are not sure if the
4403 4403                           * async thread is around, or whether we might
4404 4404                           * be racing with the detach code in ibd_m_stop()
4405 4405                           * that scans the mcg list.
4406 4406                           */
4407 4407                          if (!ibd_async_safe(state))
4408 4408                                  return;
4409 4409  
4410 4410                          req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4411 4411                          req->rq_gid = event->sm_notice_gid;
4412 4412                          req->rq_ptr = (void *)code;
4413 4413                          ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4414 4414                          break;
4415 4415          }
4416 4416  }
4417 4417  
4418 4418  static void
4419 4419  ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4420 4420  {
4421 4421          ib_gid_t mgid = req->rq_gid;
4422 4422          ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4423 4423          int ret;
4424 4424          ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff;
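        /*
         * This is the inverse of the (id_pkey << 16) packing used when
         * the IPoIB broadcast MGID prefix is constructed (see the
         * broadcast-group setup earlier in this file): the pkey occupies
         * bits 16..31 of gid_prefix.
         */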
4425 4425  
4426 4426          DPRINT(10, "ibd_async_trap : %d\n", code);
4427 4427  
4428 4428          /*
4429 4429           * Check if we have already joined the IPoIB broadcast group for our
4430 4430           * PKEY. If joined, perform the rest of the operation.
4431 4431           * Else, the interface is not initialized. Do the initialization here
4432 4432           * by calling ibd_start() and return.
4433 4433           */
4434 4434  
4435 4435          if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4436 4436              IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) &&
4437 4437              (code == IBT_SM_EVENT_MCG_CREATED)) {
4438 4438                  /*
4439 4439                   * If we are in late HCA init and a notification for the
4440 4440                   * creation of a MCG came in, check if it is the IPoIB MCG for
4441 4441                   * this pkey. If not, return.
4442 4442                   */
4443 4443                  if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey !=
4444 4444                      state->id_pkey)) {
4445 4445                          ibd_async_done(state);
4446 4446                          return;
4447 4447                  }
4448 4448                  ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4449 4449                  /*
4450 4450                   * Check if there is still a necessity to start the interface.
4451 4451                   * It is possible that the user attempted unplumb at just about
4452 4452                   * the same time, and if unplumb succeeded, we have nothing to
4453 4453                   * do.
4454 4454                   */
4455 4455                  if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4456 4456                      IBD_DRV_IN_LATE_HCA_INIT) &&
4457 4457                      ((ret = ibd_start(state)) != 0)) {
4458 4458                          DPRINT(10, "ibd_async_trap: cannot start from late HCA "
4459 4459                              "init, ret=%d", ret);
4460 4460                  }
4461 4461                  ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4462 4462                  ibd_async_done(state);
4463 4463                  return;
4464 4464          }
4465 4465  
4466 4466          /*
4467 4467           * Atomically search the nonmember and sendonlymember lists and
4468 4468           * delete.
4469 4469           */
4470 4470          ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4471 4471  
4472 4472          if (state->id_prom_op == IBD_OP_COMPLETED) {
4473 4473                  ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4474 4474  
4475 4475                  /*
4476 4476                   * If in promiscuous mode, try to join/attach to the new
4477 4477                   * mcg. Given the unreliable out-of-order mode of trap
4478 4478                   * delivery, we can never be sure whether it is a problem
4479 4479                   * if the join fails. Thus, we warn the admin of a failure
4480 4480                   * if this was a creation trap. Note that the trap might
4481 4481                   * actually be reporting a long past event, and the mcg
4482 4482                   * might already have been deleted, thus we might be warning
4483 4483                   * in vain.
4484 4484                   */
4485 4485                  if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4486 4486                      NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4487 4487                          ibd_print_warn(state, "IBA promiscuous mode missed "
4488 4488                              "new multicast gid %016llx:%016llx",
4489 4489                              (u_longlong_t)mgid.gid_prefix,
4490 4490                              (u_longlong_t)mgid.gid_guid);
4491 4491          }
4492 4492  
4493 4493          /*
4494 4494           * Free the request slot allocated by the subnet event thread.
4495 4495           */
4496 4496          ibd_async_done(state);
4497 4497  }
4498 4498  
4499 4499  /*
4500 4500   * GLDv3 entry point to get capabilities.
4501 4501   */
4502 4502  static boolean_t
4503 4503  ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4504 4504  {
4505 4505          ibd_state_t *state = arg;
4506 4506  
4507 4507          if (state->id_type == IBD_PORT_DRIVER)
4508 4508                  return (B_FALSE);
4509 4509  
4510 4510          switch (cap) {
4511 4511          case MAC_CAPAB_HCKSUM: {
4512 4512                  uint32_t *txflags = cap_data;
4513 4513  
4514 4514                  /*
4515 4515                   * We either do full checksum or not do it at all
4516 4516                   */
4517 4517                  if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4518 4518                          *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4519 4519                  else
4520 4520                          return (B_FALSE);
4521 4521                  break;
4522 4522          }
4523 4523  
4524 4524          case MAC_CAPAB_LSO: {
4525 4525                  mac_capab_lso_t *cap_lso = cap_data;
4526 4526  
4527 4527                  /*
4528 4528                   * In addition to the capability and policy, since LSO
4529 4529                   * relies on hw checksum, we'll not enable LSO if we
4530 4530                   * don't have hw checksum.  Of course, if the HCA doesn't
4531 4531                   * provide the reserved lkey capability, enabling LSO will
4532 4532                   * actually affect performance adversely, so we'll disable
4533 4533                   * LSO even for that case.
4534 4534                   */
4535 4535                  if (!state->id_lso_policy || !state->id_lso_capable)
4536 4536                          return (B_FALSE);
4537 4537  
4538 4538                  if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4539 4539                          return (B_FALSE);
4540 4540  
4541 4541                  if (state->id_hca_res_lkey_capab == 0) {
4542 4542                          ibd_print_warn(state, "no reserved-lkey capability, "
4543 4543                              "disabling LSO");
4544 4544                          return (B_FALSE);
4545 4545                  }
4546 4546  
4547 4547                  cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4548 4548                  cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4549 4549                  break;
4550 4550          }
4551 4551  
4552 4552          default:
4553 4553                  return (B_FALSE);
4554 4554          }
4555 4555  
4556 4556          return (B_TRUE);
4557 4557  }
4558 4558  
4559 4559  /*
4560 4560   * callback function for set/get of properties
4561 4561   */
4562 4562  static int
4563 4563  ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4564 4564      uint_t pr_valsize, const void *pr_val)
4565 4565  {
4566 4566          ibd_state_t *state = arg;
4567 4567          int err = 0;
4568 4568          uint32_t link_mode;
4569 4569  
4570 4570          /* Cannot set properties on a port driver */
4571 4571          if (state->id_type == IBD_PORT_DRIVER) {
4572 4572                  return (ENOTSUP);
4573 4573          }
4574 4574  
4575 4575          switch (pr_num) {
4576 4576                  case MAC_PROP_IB_LINKMODE:
4577 4577                          if (state->id_mac_state & IBD_DRV_STARTED) {
4578 4578                                  err = EBUSY;
4579 4579                                  break;
4580 4580                          }
4581 4581                          if (pr_val == NULL) {
4582 4582                                  err = EINVAL;
4583 4583                                  break;
4584 4584                          }
4585 4585                          bcopy(pr_val, &link_mode, sizeof (link_mode));
4586 4586                          if (link_mode != IBD_LINK_MODE_UD &&
4587 4587                              link_mode != IBD_LINK_MODE_RC) {
4588 4588                                  err = EINVAL;
4589 4589                          } else {
4590 4590                                  if (link_mode == IBD_LINK_MODE_RC) {
4591 4591                                          if (state->id_enable_rc) {
4592 4592                                                  return (0);
4593 4593                                          }
4594 4594                                          state->id_enable_rc = 1;
4595 4595                                          /* inform MAC framework of new MTU */
4596 4596                                          err = mac_maxsdu_update2(state->id_mh,
4597 4597                                              state->rc_mtu - IPOIB_HDRSIZE,
4598 4598                                              state->id_mtu - IPOIB_HDRSIZE);
4599 4599                                  } else {
4600 4600                                          if (!state->id_enable_rc) {
4601 4601                                                  return (0);
4602 4602                                          }
4603 4603                                          state->id_enable_rc = 0;
4604 4604                                          err = mac_maxsdu_update2(state->id_mh,
4605 4605                                              state->id_mtu - IPOIB_HDRSIZE,
4606 4606                                              state->id_mtu - IPOIB_HDRSIZE);
4607 4607                                  }
4608 4608                                  (void) ibd_record_capab(state);
4609 4609                                  mac_capab_update(state->id_mh);
4610 4610                          }
4611 4611                          break;
4612 4612                  case MAC_PROP_PRIVATE:
4613 4613                          err = ibd_set_priv_prop(state, pr_name,
4614 4614                              pr_valsize, pr_val);
4615 4615                          break;
4616 4616                  default:
4617 4617                          err = ENOTSUP;
4618 4618                          break;
4619 4619          }
4620 4620          return (err);
4621 4621  }
4622 4622  
4623 4623  static int
4624 4624  ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4625 4625      uint_t pr_valsize, void *pr_val)
4626 4626  {
4627 4627          ibd_state_t *state = arg;
4628 4628          int err = 0;
4629 4629  
4630 4630          switch (pr_num) {
4631 4631                  case MAC_PROP_MTU:
4632 4632                          break;
4633 4633                  default:
4634 4634                          if (state->id_type == IBD_PORT_DRIVER) {
4635 4635                                  return (ENOTSUP);
4636 4636                          }
4637 4637                          break;
4638 4638          }
4639 4639  
4640 4640          switch (pr_num) {
4641 4641                  case MAC_PROP_IB_LINKMODE:
4642 4642                          *(uint_t *)pr_val = state->id_enable_rc;
4643 4643                          break;
4644 4644                  case MAC_PROP_PRIVATE:
4645 4645                          err = ibd_get_priv_prop(state, pr_name, pr_valsize,
4646 4646                              pr_val);
4647 4647                          break;
4648 4648                  default:
4649 4649                          err = ENOTSUP;
4650 4650                          break;
4651 4651          }
4652 4652          return (err);
4653 4653  }
4654 4654  
4655 4655  static void
4656 4656  ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4657 4657      mac_prop_info_handle_t prh)
4658 4658  {
4659 4659          ibd_state_t *state = arg;
4660 4660  
4661 4661          switch (pr_num) {
4662 4662          case MAC_PROP_IB_LINKMODE: {
4663 4663                  mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE);
4664 4664                  break;
4665 4665          }
4666 4666          case MAC_PROP_MTU: {
4667 4667                  uint32_t min, max;
4668 4668                  if (state->id_type == IBD_PORT_DRIVER) {
4669 4669                          min = 1500;
4670 4670                          max = IBD_DEF_RC_MAX_SDU;
4671 4671                  } else if (state->id_enable_rc) {
4672 4672                          min = max = IBD_DEF_RC_MAX_SDU;
4673 4673                  } else {
4674 4674                          min = max = state->id_mtu - IPOIB_HDRSIZE;
4675 4675                  }
4676 4676                  mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4677 4677                  mac_prop_info_set_range_uint32(prh, min, max);
4678 4678                  break;
4679 4679          }
4680 4680          case MAC_PROP_PRIVATE: {
4681 4681                  char valstr[64];
4682 4682                  int value;
4683 4683  
4684 4684                  if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4685 4685                          mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4686 4686                          return;
4687 4687                  } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4688 4688                          value = IBD_DEF_COALESCE_COMPLETIONS;
4689 4689                  } else if (strcmp(pr_name,
4690 4690                      "_ibd_create_broadcast_group") == 0) {
4691 4691                          value = IBD_DEF_CREATE_BCAST_GROUP;
4692 4692                  } else if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4693 4693                          value = IBD_DEF_HASH_SIZE;
4694 4694                  } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4695 4695                          value = IBD_DEF_LSO_POLICY;
4696 4696                  } else if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4697 4697                          value = IBD_DEF_NUM_AH;
4698 4698                  } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4699 4699                          value = IBD_DEF_NUM_LSO_BUFS;
4700 4700                  } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4701 4701                          value = IBD_DEF_RC_ENABLE_SRQ;
4702 4702                  } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4703 4703                          value = IBD_DEF_RC_NUM_RWQE;
4704 4704                  } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4705 4705                          value = IBD_DEF_RC_NUM_SRQ;
4706 4706                  } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4707 4707                          value = IBD_DEF_RC_NUM_SWQE;
4708 4708                  } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4709 4709                          value = IBD_DEF_RC_RX_COMP_COUNT;
4710 4710                  } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4711 4711                          value = IBD_DEF_RC_RX_COMP_USEC;
4712 4712                  } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4713 4713                          value = IBD_DEF_RC_RX_COPY_THRESH;
4714 4714                  } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4715 4715                          value = IBD_DEF_RC_RX_RWQE_THRESH;
4716 4716                  } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4717 4717                          value = IBD_DEF_RC_TX_COMP_COUNT;
4718 4718                  } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4719 4719                          value = IBD_DEF_RC_TX_COMP_USEC;
4720 4720                  } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4721 4721                          value = IBD_DEF_RC_TX_COPY_THRESH;
4722 4722                  } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4723 4723                          value = IBD_DEF_UD_NUM_RWQE;
4724 4724                  } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4725 4725                          value = IBD_DEF_UD_NUM_SWQE;
4726 4726                  } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4727 4727                          value = IBD_DEF_UD_RX_COMP_COUNT;
4728 4728                  } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4729 4729                          value = IBD_DEF_UD_RX_COMP_USEC;
4730 4730                  } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4731 4731                          value = IBD_DEF_UD_TX_COMP_COUNT;
4732 4732                  } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4733 4733                          value = IBD_DEF_UD_TX_COMP_USEC;
4734 4734                  } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4735 4735                          value = IBD_DEF_UD_TX_COPY_THRESH;
4736 4736                  } else {
4737 4737                          return;
4738 4738                  }
4739 4739  
4740 4740                  (void) snprintf(valstr, sizeof (valstr), "%d", value);
4741 4741                  mac_prop_info_set_default_str(prh, valstr);
4742 4742                  break;
4743 4743          }
4744 4744          } /* switch (pr_num) */
4745 4745  }
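
           /*
            * A note on how the defaults above surface to administrators: the
            * string set via mac_prop_info_set_default_str() is what dladm(1M)
            * reports in the DEFAULT column for a driver-private property,
            * e.g. (hypothetical link name):
            *
            *	# dladm show-linkprop -p _ibd_hash_size ibd0
            */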
4746 4746  
4747 4747  /* ARGSUSED2 */
4748 4748  static int
4749 4749  ibd_set_priv_prop(ibd_state_t *state, const char *pr_name,
4750 4750      uint_t pr_valsize, const void *pr_val)
4751 4751  {
4752 4752          int err = 0;
4753 4753          long result;
4754 4754  
4755 4755          if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4756 4756                  if (pr_val == NULL) {
4757 4757                          return (EINVAL);
4758 4758                  }
4759 4759                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4760 4760                  if (result < 0 || result > 1) {
4761 4761                          err = EINVAL;
4762 4762                  } else {
4763 4763                          state->id_allow_coalesce_comp_tuning = (result == 1) ?
4764 4764                              B_TRUE : B_FALSE;
4765 4765                  }
4766 4766                  return (err);
4767 4767          }
4768 4768          if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4769 4769                  if (state->id_mac_state & IBD_DRV_STARTED) {
4770 4770                          return (EBUSY);
4771 4771                  }
4772 4772                  if (pr_val == NULL) {
4773 4773                          return (EINVAL);
4774 4774                  }
4775 4775                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4776 4776                  if (result < 0 || result > 1) {
4777 4777                          err = EINVAL;
4778 4778                  } else {
4779 4779                          state->id_create_broadcast_group = (result == 1) ?
4780 4780                              B_TRUE : B_FALSE;
4781 4781                  }
4782 4782                  return (err);
4783 4783          }
4784 4784          if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4785 4785                  if (state->id_mac_state & IBD_DRV_STARTED) {
4786 4786                          return (EBUSY);
4787 4787                  }
4788 4788                  if (pr_val == NULL) {
4789 4789                          return (EINVAL);
4790 4790                  }
4791 4791                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4792 4792                  if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) {
4793 4793                          err = EINVAL;
4794 4794                  } else {
4795 4795                          state->id_hash_size = (uint32_t)result;
4796 4796                  }
4797 4797                  return (err);
4798 4798          }
4799 4799          if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4800 4800                  if (state->id_mac_state & IBD_DRV_STARTED) {
4801 4801                          return (EBUSY);
4802 4802                  }
4803 4803                  if (pr_val == NULL) {
4804 4804                          return (EINVAL);
4805 4805                  }
4806 4806                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4807 4807                  if (result < 0 || result > 1) {
4808 4808                          err = EINVAL;
4809 4809                  } else {
4810 4810                          state->id_lso_policy = (result == 1) ?
4811 4811                              B_TRUE : B_FALSE;
4812 4812                  }
4813 4813                  mac_capab_update(state->id_mh);
4814 4814                  return (err);
4815 4815          }
4816 4816          if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4817 4817                  if (state->id_mac_state & IBD_DRV_STARTED) {
4818 4818                          return (EBUSY);
4819 4819                  }
4820 4820                  if (pr_val == NULL) {
4821 4821                          return (EINVAL);
4822 4822                  }
4823 4823                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4824 4824                  if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) {
4825 4825                          err = EINVAL;
4826 4826                  } else {
4827 4827                          state->id_num_ah = (uint32_t)result;
4828 4828                  }
4829 4829                  return (err);
4830 4830          }
4831 4831          if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4832 4832                  if (state->id_mac_state & IBD_DRV_STARTED) {
4833 4833                          return (EBUSY);
4834 4834                  }
4835 4835                  if (!state->id_lso_policy || !state->id_lso_capable) {
4836 4836                          return (EINVAL);
4837 4837                  }
4838 4838                  if (pr_val == NULL) {
4839 4839                          return (EINVAL);
4840 4840                  }
4841 4841                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4842 4842                  if (result < IBD_MIN_NUM_LSO_BUFS ||
4843 4843                      result > IBD_MAX_NUM_LSO_BUFS) {
4844 4844                          err = EINVAL;
4845 4845                  } else {
4846 4846                          state->id_num_lso_bufs = (uint32_t)result;
4847 4847                  }
4848 4848                  return (err);
4849 4849          }
4850 4850          if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4851 4851                  if (state->id_mac_state & IBD_DRV_STARTED) {
4852 4852                          return (EBUSY);
4853 4853                  }
4854 4854                  if (pr_val == NULL) {
4855 4855                          return (EINVAL);
4856 4856                  }
4857 4857                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4858 4858                  if (result < 0 || result > 1) {
4859 4859                          err = EINVAL;
4860 4860                  } else {
4861 4861                          state->rc_enable_srq = (result == 1) ?
4862 4862                              B_TRUE : B_FALSE;
4863 4863                  }
4864 4864                  if (!state->rc_enable_srq) {
4865 4865                          state->id_rc_num_srq = 0;
4866 4866                  }
4867 4867                  return (err);
4868 4868          }
4869 4869          if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4870 4870                  if (state->id_mac_state & IBD_DRV_STARTED) {
4871 4871                          return (EBUSY);
4872 4872                  }
4873 4873                  if (pr_val == NULL) {
4874 4874                          return (EINVAL);
4875 4875                  }
4876 4876                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4877 4877                  if (result < IBD_MIN_RC_NUM_RWQE ||
4878 4878                      result > IBD_MAX_RC_NUM_RWQE) {
4879 4879                          err = EINVAL;
4880 4880                  } else {
4881 4881                          state->id_rc_num_rwqe = (uint32_t)result;
4882 4882                          if (state->id_allow_coalesce_comp_tuning &&
4883 4883                              state->id_rc_rx_comp_count > state->id_rc_num_rwqe)
4884 4884                                  state->id_rc_rx_comp_count =
4885 4885                                      state->id_rc_num_rwqe;
4886 4886                          if (state->id_rc_num_srq > state->id_rc_num_rwqe)
4887 4887                                  state->id_rc_num_srq =
4888 4888                                      state->id_rc_num_rwqe - 1;
4889 4889                          /*
4890 4890                           * If rx_rwqe_threshold is greater than the number of
4891 4891                           * rwqes, pull it back to 25% of the number of rwqes.
4892 4892                           */
4893 4893                          if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe)
4894 4894                                  state->id_rc_rx_rwqe_thresh =
4895 4895                                      (state->id_rc_num_rwqe >> 2);
4896 4896  
4897 4897                  }
4898 4898                  return (err);
4899 4899          }
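                   /*
                    * A worked example of the adjustments above (hypothetical
                    * numbers): shrinking _ibd_rc_num_rwqe to 400 while
                    * id_rc_rx_rwqe_thresh is 500 pulls the threshold back to
                    * 400 >> 2 = 100, and an id_rc_num_srq of 400 or more is
                    * clipped to 399.
                    */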
4900 4900          if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4901 4901                  if (state->id_mac_state & IBD_DRV_STARTED) {
4902 4902                          return (EBUSY);
4903 4903                  }
4904 4904                  if (pr_val == NULL) {
4905 4905                          return (EINVAL);
4906 4906                  }
4907 4907                  if (!state->rc_enable_srq)
4908 4908                          return (EINVAL);
4909 4909  
4910 4910                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4911 4911                  if (result < IBD_MIN_RC_NUM_SRQ ||
4912 4912                      result >= state->id_rc_num_rwqe) {
4913 4913                          err = EINVAL;
4914 4914                  } else {
4915 4915                          state->id_rc_num_srq = (uint32_t)result;
                           }
4916 4916                  return (err);
4917 4917          }
4918 4918          if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4919 4919                  if (state->id_mac_state & IBD_DRV_STARTED) {
4920 4920                          return (EBUSY);
4921 4921                  }
4922 4922                  if (pr_val == NULL) {
4923 4923                          return (EINVAL);
4924 4924                  }
4925 4925                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4926 4926                  if (result < IBD_MIN_RC_NUM_SWQE ||
4927 4927                      result > IBD_MAX_RC_NUM_SWQE) {
4928 4928                          err = EINVAL;
4929 4929                  } else {
4930 4930                          state->id_rc_num_swqe = (uint32_t)result;
4931 4931                          if (state->id_allow_coalesce_comp_tuning &&
4932 4932                              state->id_rc_tx_comp_count > state->id_rc_num_swqe)
4933 4933                                  state->id_rc_tx_comp_count =
4934 4934                                      state->id_rc_num_swqe;
4935 4935                  }
4936 4936                  return (err);
4937 4937          }
4938 4938          if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4939 4939                  if (!state->id_allow_coalesce_comp_tuning) {
4940 4940                          return (ENOTSUP);
4941 4941                  }
4942 4942                  if (pr_val == NULL) {
4943 4943                          return (EINVAL);
4944 4944                  }
4945 4945                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4946 4946                  if (result < 1 || result > state->id_rc_num_rwqe) {
4947 4947                          err = EINVAL;
4948 4948                  } else {
4949 4949                          state->id_rc_rx_comp_count = (uint32_t)result;
4950 4950                  }
4951 4951                  return (err);
4952 4952          }
4953 4953          if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4954 4954                  if (!state->id_allow_coalesce_comp_tuning) {
4955 4955                          return (ENOTSUP);
4956 4956                  }
4957 4957                  if (pr_val == NULL) {
4958 4958                          return (EINVAL);
4959 4959                  }
4960 4960                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4961 4961                  if (result < 1) {
4962 4962                          err = EINVAL;
4963 4963                  } else {
4964 4964                          state->id_rc_rx_comp_usec = (uint32_t)result;
4965 4965                  }
4966 4966                  return (err);
4967 4967          }
4968 4968          if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4969 4969                  if (state->id_mac_state & IBD_DRV_STARTED) {
4970 4970                          return (EBUSY);
4971 4971                  }
4972 4972                  if (pr_val == NULL) {
4973 4973                          return (EINVAL);
4974 4974                  }
4975 4975                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4976 4976                  if (result < IBD_MIN_RC_RX_COPY_THRESH ||
4977 4977                      result > state->rc_mtu) {
4978 4978                          err = EINVAL;
4979 4979                  } else {
4980 4980                          state->id_rc_rx_copy_thresh = (uint32_t)result;
4981 4981                  }
4982 4982                  return (err);
4983 4983          }
4984 4984          if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4985 4985                  if (state->id_mac_state & IBD_DRV_STARTED) {
4986 4986                          return (EBUSY);
4987 4987                  }
4988 4988                  if (pr_val == NULL) {
4989 4989                          return (EINVAL);
4990 4990                  }
4991 4991                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4992 4992                  if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
4993 4993                      result >= state->id_rc_num_rwqe) {
4994 4994                          err = EINVAL;
4995 4995                  } else {
4996 4996                          state->id_rc_rx_rwqe_thresh = (uint32_t)result;
4997 4997                  }
4998 4998                  return (err);
4999 4999          }
5000 5000          if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5001 5001                  if (!state->id_allow_coalesce_comp_tuning) {
5002 5002                          return (ENOTSUP);
5003 5003                  }
5004 5004                  if (pr_val == NULL) {
5005 5005                          return (EINVAL);
5006 5006                  }
5007 5007                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5008 5008                  if (result < 1 || result > state->id_rc_num_swqe) {
5009 5009                          err = EINVAL;
5010 5010                  } else {
5011 5011                          state->id_rc_tx_comp_count = (uint32_t)result;
5012 5012                  }
5013 5013                  return (err);
5014 5014          }
5015 5015          if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5016 5016                  if (!state->id_allow_coalesce_comp_tuning) {
5017 5017                          return (ENOTSUP);
5018 5018                  }
5019 5019                  if (pr_val == NULL) {
5020 5020                          return (EINVAL);
5021 5021                  }
5022 5022                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5023 5023                  if (result < 1) {
5024 5024                          err = EINVAL;
5025 5025                  } else {
5026 5026                          state->id_rc_tx_comp_usec = (uint32_t)result;
5027 5027                  }
5028 5028                  return (err);
5029 5029          }
5030 5030          if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5031 5031                  if (state->id_mac_state & IBD_DRV_STARTED) {
5032 5032                          return (EBUSY);
5033 5033                  }
5034 5034                  if (pr_val == NULL) {
5035 5035                          return (EINVAL);
5036 5036                  }
5037 5037                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5038 5038                  if (result < IBD_MIN_RC_TX_COPY_THRESH ||
5039 5039                      result > state->rc_mtu) {
5040 5040                          err = EINVAL;
5041 5041                  } else {
5042 5042                          state->id_rc_tx_copy_thresh = (uint32_t)result;
5043 5043                  }
5044 5044                  return (err);
5045 5045          }
5046 5046          if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5047 5047                  if (state->id_mac_state & IBD_DRV_STARTED) {
5048 5048                          return (EBUSY);
5049 5049                  }
5050 5050                  if (pr_val == NULL) {
5051 5051                          return (EINVAL);
5052 5052                  }
5053 5053                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5054 5054                  if (result < IBD_MIN_UD_NUM_RWQE ||
5055 5055                      result > IBD_MAX_UD_NUM_RWQE) {
5056 5056                          err = EINVAL;
5057 5057                  } else {
5058 5058                          if (result > state->id_hca_max_chan_sz) {
5059 5059                                  state->id_ud_num_rwqe =
5060 5060                                      state->id_hca_max_chan_sz;
5061 5061                          } else {
5062 5062                                  state->id_ud_num_rwqe = (uint32_t)result;
5063 5063                          }
5064 5064                          if (state->id_allow_coalesce_comp_tuning &&
5065 5065                              state->id_ud_rx_comp_count > state->id_ud_num_rwqe)
5066 5066                                  state->id_ud_rx_comp_count =
5067 5067                                      state->id_ud_num_rwqe;
5068 5068                  }
5069 5069                  return (err);
5070 5070          }
5071 5071          if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5072 5072                  if (state->id_mac_state & IBD_DRV_STARTED) {
5073 5073                          return (EBUSY);
5074 5074                  }
5075 5075                  if (pr_val == NULL) {
5076 5076                          return (EINVAL);
5077 5077                  }
5078 5078                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5079 5079                  if (result < IBD_MIN_UD_NUM_SWQE ||
5080 5080                      result > IBD_MAX_UD_NUM_SWQE) {
5081 5081                          err = EINVAL;
5082 5082                  } else {
5083 5083                          if (result > state->id_hca_max_chan_sz) {
5084 5084                                  state->id_ud_num_swqe =
5085 5085                                      state->id_hca_max_chan_sz;
5086 5086                          } else {
5087 5087                                  state->id_ud_num_swqe = (uint32_t)result;
5088 5088                          }
5089 5089                          if (state->id_allow_coalesce_comp_tuning &&
5090 5090                              state->id_ud_tx_comp_count > state->id_ud_num_swqe)
5091 5091                                  state->id_ud_tx_comp_count =
5092 5092                                      state->id_ud_num_swqe;
5093 5093                  }
5094 5094                  return (err);
5095 5095          }
5096 5096          if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5097 5097                  if (!state->id_allow_coalesce_comp_tuning) {
5098 5098                          return (ENOTSUP);
5099 5099                  }
5100 5100                  if (pr_val == NULL) {
5101 5101                          return (EINVAL);
5102 5102                  }
5103 5103                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5104 5104                  if (result < 1 || result > state->id_ud_num_rwqe) {
5105 5105                          err = EINVAL;
5106 5106                  } else {
5107 5107                          state->id_ud_rx_comp_count = (uint32_t)result;
5108 5108                  }
5109 5109                  return (err);
5110 5110          }
5111 5111          if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5112 5112                  if (!state->id_allow_coalesce_comp_tuning) {
5113 5113                          return (ENOTSUP);
5114 5114                  }
5115 5115                  if (pr_val == NULL) {
5116 5116                          return (EINVAL);
5117 5117                  }
5118 5118                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5119 5119                  if (result < 1) {
5120 5120                          err = EINVAL;
5121 5121                  } else {
5122 5122                          state->id_ud_rx_comp_usec = (uint32_t)result;
5123 5123                  }
5124 5124                  return (err);
5125 5125          }
5126 5126          if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5127 5127                  if (!state->id_allow_coalesce_comp_tuning) {
5128 5128                          return (ENOTSUP);
5129 5129                  }
5130 5130                  if (pr_val == NULL) {
5131 5131                          return (EINVAL);
5132 5132                  }
5133 5133                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5134 5134                  if (result < 1 || result > state->id_ud_num_swqe) {
5135 5135                          err = EINVAL;
5136 5136                  } else {
5137 5137                          state->id_ud_tx_comp_count = (uint32_t)result;
5138 5138                  }
5139 5139                  return (err);
5140 5140          }
5141 5141          if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5142 5142                  if (!state->id_allow_coalesce_comp_tuning) {
5143 5143                          return (ENOTSUP);
5144 5144                  }
5145 5145                  if (pr_val == NULL) {
5146 5146                          return (EINVAL);
5147 5147                  }
5148 5148                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5149 5149                  if (result < 1) {
5150 5150                          err = EINVAL;
5151 5151                  } else {
5152 5152                          state->id_ud_tx_comp_usec = (uint32_t)result;
5153 5153                  }
5154 5154                  return (err);
5155 5155          }
5156 5156          if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5157 5157                  if (state->id_mac_state & IBD_DRV_STARTED) {
5158 5158                          return (EBUSY);
5159 5159                  }
5160 5160                  if (pr_val == NULL) {
5161 5161                          return (EINVAL);
5162 5162                  }
5163 5163                  (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5164 5164                  if (result < IBD_MIN_UD_TX_COPY_THRESH ||
5165 5165                      result > IBD_MAX_UD_TX_COPY_THRESH) {
5166 5166                          err = EINVAL;
5167 5167                  } else {
5168 5168                          state->id_ud_tx_copy_thresh = (uint32_t)result;
5169 5169                  }
5170 5170                  return (err);
5171 5171          }
5172 5172          return (ENOTSUP);
5173 5173  }
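
           /*
            * All of the "_ibd_*" names above are driver-private link
            * properties, so a hypothetical tuning session (link name ibd0
            * and values assumed) would look like:
            *
            *	# dladm set-linkprop -p _ibd_lso_enable=1 ibd0
            *	# dladm set-linkprop -p _ibd_num_ah=64 ibd0
            *
            * Note that the properties gated on IBD_DRV_STARTED above return
            * EBUSY while the link is started.
            */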
5174 5174  
5175 5175  static int
5176 5176  ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize,
5177 5177      void *pr_val)
5178 5178  {
5179 5179          int err = ENOTSUP;
5180 5180          int value;
5181 5181  
5182 5182          if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
5183 5183                  value = state->id_bgroup_present;
5184 5184                  err = 0;
5185 5185                  goto done;
5186 5186          }
5187 5187          if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
5188 5188                  value = state->id_allow_coalesce_comp_tuning;
5189 5189                  err = 0;
5190 5190                  goto done;
5191 5191          }
5192 5192          if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
5193 5193                  value = state->id_create_broadcast_group;
5194 5194                  err = 0;
5195 5195                  goto done;
5196 5196          }
5197 5197          if (strcmp(pr_name, "_ibd_hash_size") == 0) {
5198 5198                  value = state->id_hash_size;
5199 5199                  err = 0;
5200 5200                  goto done;
5201 5201          }
5202 5202          if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
5203 5203                  value = state->id_lso_policy;
5204 5204                  err = 0;
5205 5205                  goto done;
5206 5206          }
5207 5207          if (strcmp(pr_name, "_ibd_num_ah") == 0) {
5208 5208                  value = state->id_num_ah;
5209 5209                  err = 0;
5210 5210                  goto done;
5211 5211          }
5212 5212          if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
5213 5213                  value = state->id_num_lso_bufs;
5214 5214                  err = 0;
5215 5215                  goto done;
5216 5216          }
5217 5217          if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
5218 5218                  value = state->rc_enable_srq;
5219 5219                  err = 0;
5220 5220                  goto done;
5221 5221          }
5222 5222          if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
5223 5223                  value = state->id_rc_num_rwqe;
5224 5224                  err = 0;
5225 5225                  goto done;
5226 5226          }
5227 5227          if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
5228 5228                  value = state->id_rc_num_srq;
5229 5229                  err = 0;
5230 5230                  goto done;
5231 5231          }
5232 5232          if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
5233 5233                  value = state->id_rc_num_swqe;
5234 5234                  err = 0;
5235 5235                  goto done;
5236 5236          }
5237 5237          if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
5238 5238                  value = state->id_rc_rx_comp_count;
5239 5239                  err = 0;
5240 5240                  goto done;
5241 5241          }
5242 5242          if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
5243 5243                  value = state->id_rc_rx_comp_usec;
5244 5244                  err = 0;
5245 5245                  goto done;
5246 5246          }
5247 5247          if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
5248 5248                  value = state->id_rc_rx_copy_thresh;
5249 5249                  err = 0;
5250 5250                  goto done;
5251 5251          }
5252 5252          if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
5253 5253                  value = state->id_rc_rx_rwqe_thresh;
5254 5254                  err = 0;
5255 5255                  goto done;
5256 5256          }
5257 5257          if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5258 5258                  value = state->id_rc_tx_comp_count;
5259 5259                  err = 0;
5260 5260                  goto done;
5261 5261          }
5262 5262          if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5263 5263                  value = state->id_rc_tx_comp_usec;
5264 5264                  err = 0;
5265 5265                  goto done;
5266 5266          }
5267 5267          if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5268 5268                  value = state->id_rc_tx_copy_thresh;
5269 5269                  err = 0;
5270 5270                  goto done;
5271 5271          }
5272 5272          if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5273 5273                  value = state->id_ud_num_rwqe;
5274 5274                  err = 0;
5275 5275                  goto done;
5276 5276          }
5277 5277          if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5278 5278                  value = state->id_ud_num_swqe;
5279 5279                  err = 0;
5280 5280                  goto done;
5281 5281          }
5282 5282          if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5283 5283                  value = state->id_ud_rx_comp_count;
5284 5284                  err = 0;
5285 5285                  goto done;
5286 5286          }
5287 5287          if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5288 5288                  value = state->id_ud_rx_comp_usec;
5289 5289                  err = 0;
5290 5290                  goto done;
5291 5291          }
5292 5292          if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5293 5293                  value = state->id_ud_tx_comp_count;
5294 5294                  err = 0;
5295 5295                  goto done;
5296 5296          }
5297 5297          if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5298 5298                  value = state->id_ud_tx_comp_usec;
5299 5299                  err = 0;
5300 5300                  goto done;
5301 5301          }
5302 5302          if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5303 5303                  value = state->id_ud_tx_copy_thresh;
5304 5304                  err = 0;
5305 5305                  goto done;
5306 5306          }
5307 5307  done:
5308 5308          if (err == 0) {
5309 5309                  (void) snprintf(pr_val, pr_valsize, "%d", value);
5310 5310          }
5311 5311          return (err);
5312 5312  }
5313 5313  
5314 5314  static int
5315 5315  ibd_get_port_details(ibd_state_t *state)
5316 5316  {
5317 5317          ibt_hca_portinfo_t *port_infop;
5318 5318          ibt_status_t ret;
5319 5319          uint_t psize, port_infosz;
5320 5320  
5321 5321          mutex_enter(&state->id_link_mutex);
5322 5322  
5323 5323          /*
5324 5324           * Query for port information
5325 5325           */
5326 5326          ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
5327 5327              &port_infop, &psize, &port_infosz);
5328 5328          if ((ret != IBT_SUCCESS) || (psize != 1)) {
5329 5329                  mutex_exit(&state->id_link_mutex);
5330 5330                  DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
5331 5331                      "failed, ret=%d", ret);
5332 5332                  return (ENETDOWN);
5333 5333          }
5334 5334  
5335 5335          /*
5336 5336           * If the link is active, verify the pkey
5337 5337           */
5338 5338          if (port_infop->p_linkstate == IBT_PORT_ACTIVE) {
5339 5339                  if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
5340 5340                      state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
5341 5341                          state->id_link_state = LINK_STATE_DOWN;
5342 5342                  } else {
5343 5343                          state->id_link_state = LINK_STATE_UP;
5344 5344                  }
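                           /*
                            * p_mtu is the IBTA-encoded MTU enum (1 = 256
                            * bytes through 5 = 4096 bytes), so shifting 128
                            * left by it yields the MTU in bytes, e.g.
                            * 128 << 4 = 2048.
                            */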
5345 5345                  state->id_mtu = (128 << port_infop->p_mtu);
5346 5346                  _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
5347 5347                  state->id_sgid = *port_infop->p_sgid_tbl;
5348 5348                  _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
5349 5349                  /*
5350 5350                   * Now that the port is active, record the port speed
5351 5351                   */
5352 5352                  state->id_link_speed = ibd_get_portspeed(state);
5353 5353          } else {
5354 5354                  /* Make sure that these are handled in PORT_UP/CHANGE */
5355 5355                  state->id_mtu = 0;
5356 5356                  state->id_link_state = LINK_STATE_DOWN;
5357 5357                  state->id_link_speed = 0;
5358 5358          }
5359 5359          mutex_exit(&state->id_link_mutex);
5360 5360          ibt_free_portinfo(port_infop, port_infosz);
5361 5361  
5362 5362          return (0);
5363 5363  }
5364 5364  
5365 5365  static int
5366 5366  ibd_alloc_cqs(ibd_state_t *state)
5367 5367  {
5368 5368          ibt_hca_attr_t hca_attrs;
5369 5369          ibt_cq_attr_t cq_attr;
5370 5370          ibt_status_t ret;
5371 5371          uint32_t real_size;
5372 5372          uint_t num_rwqe_change = 0;
5373 5373          uint_t num_swqe_change = 0;
5374 5374  
5375 5375          ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
5376 5376          ASSERT(ret == IBT_SUCCESS);
5377 5377  
5378 5378          /*
5379 5379           * Allocate Rx/combined CQ:
5380 5380           * Theoretically, there is no point in having more than #rwqe
5381 5381           * plus #swqe cqe's, except that the CQ will be signaled for
5382 5382           * overflow when the last wqe completes, if none of the previous
5383 5383           * cqe's have been polled. Thus, we allocate a few fewer wqe's
5384 5384           * than cqe's to make sure such an overflow does not occur.
5385 5385           */
5386 5386          cq_attr.cq_sched = NULL;
5387 5387          cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
5388 5388  
5389 5389          /*
5390 5390           * Allocate Receive CQ.
5391 5391           */
5392 5392          if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) {
5393 5393                  cq_attr.cq_size = state->id_ud_num_rwqe + 1;
5394 5394          } else {
5395 5395                  cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5396 5396                  num_rwqe_change = state->id_ud_num_rwqe;
5397 5397                  state->id_ud_num_rwqe = cq_attr.cq_size - 1;
5398 5398          }
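
                   /*
                    * For example (hypothetical sizes): with id_ud_num_rwqe =
                    * 4000 and hca_max_cq_sz = 16384, we simply ask for a
                    * 4001-entry CQ; if the HCA supported only 1024 CQ
                    * entries, we would clamp cq_size to 1024 and scale
                    * id_ud_num_rwqe back to 1023, which ibd_print_warn()
                    * reports further below.
                    */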
5399 5399  
5400 5400          if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5401 5401              &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
5402 5402                  DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
5403 5403                      "failed, ret=%d\n", ret);
5404 5404                  return (DDI_FAILURE);
5405 5405          }
5406 5406  
5407 5407          if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count,
5408 5408              state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) {
5409 5409                  DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
5410 5410                      "moderation failed, ret=%d\n", ret);
5411 5411          }
5412 5412  
5413 5413          /* make the #rx wc's the same as max rx chain size */
5414 5414          state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
5415 5415          state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
5416 5416              state->id_rxwcs_size, KM_SLEEP);
5417 5417  
5418 5418          /*
5419 5419           * Allocate Send CQ.
5420 5420           */
5421 5421          if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) {
5422 5422                  cq_attr.cq_size = state->id_ud_num_swqe + 1;
5423 5423          } else {
5424 5424                  cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5425 5425                  num_swqe_change = state->id_ud_num_swqe;
5426 5426                  state->id_ud_num_swqe = cq_attr.cq_size - 1;
5427 5427          }
5428 5428  
5429 5429          if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5430 5430              &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
5431 5431                  DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
5432 5432                      "failed, ret=%d\n", ret);
5433 5433                  kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
5434 5434                      state->id_rxwcs_size);
5435 5435                  (void) ibt_free_cq(state->id_rcq_hdl);
5436 5436                  return (DDI_FAILURE);
5437 5437          }
5438 5438          if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count,
5439 5439              state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) {
5440 5440                  DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
5441 5441                      "moderation failed, ret=%d\n", ret);
5442 5442          }
5443 5443  
5444 5444          state->id_txwcs_size = IBD_TX_POLL_THRESH;
5445 5445          state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
5446 5446              state->id_txwcs_size, KM_SLEEP);
5447 5447  
5448 5448          /*
5449 5449           * Print a message in case we could not allocate as many wqe's
5450 5450           * as were requested.
5451 5451           */
5452 5452          if (num_rwqe_change) {
5453 5453                  ibd_print_warn(state, "Setting #rwqe = %d instead of default "
5454 5454                      "%d", state->id_ud_num_rwqe, num_rwqe_change);
5455 5455          }
5456 5456          if (num_swqe_change) {
5457 5457                  ibd_print_warn(state, "Setting #swqe = %d instead of default "
5458 5458                      "%d", state->id_ud_num_swqe, num_swqe_change);
5459 5459          }
5460 5460  
5461 5461          return (DDI_SUCCESS);
5462 5462  }
5463 5463  
5464 5464  static int
5465 5465  ibd_setup_ud_channel(ibd_state_t *state)
5466 5466  {
5467 5467          ibt_ud_chan_alloc_args_t ud_alloc_attr;
5468 5468          ibt_ud_chan_query_attr_t ud_chan_attr;
5469 5469          ibt_status_t ret;
5470 5470  
5471 5471          ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
5472 5472          if (state->id_hca_res_lkey_capab)
5473 5473                  ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
5474 5474          if (state->id_lso_policy && state->id_lso_capable)
5475 5475                  ud_alloc_attr.ud_flags |= IBT_USES_LSO;
5476 5476  
5477 5477          ud_alloc_attr.ud_hca_port_num   = state->id_port;
5478 5478          ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
5479 5479          ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
5480 5480          ud_alloc_attr.ud_sizes.cs_sq    = state->id_ud_num_swqe;
5481 5481          ud_alloc_attr.ud_sizes.cs_rq    = state->id_ud_num_rwqe;
5482 5482          ud_alloc_attr.ud_qkey           = state->id_mcinfo->mc_qkey;
5483 5483          ud_alloc_attr.ud_scq            = state->id_scq_hdl;
5484 5484          ud_alloc_attr.ud_rcq            = state->id_rcq_hdl;
5485 5485          ud_alloc_attr.ud_pd             = state->id_pd_hdl;
5486 5486          ud_alloc_attr.ud_pkey_ix        = state->id_pkix;
5487 5487          ud_alloc_attr.ud_clone_chan     = NULL;
5488 5488  
5489 5489          if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
5490 5490              &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
5491 5491                  DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
5492 5492                      "failed, ret=%d\n", ret);
5493 5493                  return (DDI_FAILURE);
5494 5494          }
5495 5495  
5496 5496          if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
5497 5497              &ud_chan_attr)) != IBT_SUCCESS) {
5498 5498                  DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
5499 5499                      "failed, ret=%d\n", ret);
5500 5500                  (void) ibt_free_channel(state->id_chnl_hdl);
5501 5501                  return (DDI_FAILURE);
5502 5502          }
5503 5503  
5504 5504          state->id_qpnum = ud_chan_attr.ud_qpn;
5505 5505  
5506 5506          return (DDI_SUCCESS);
5507 5507  }
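
           /*
            * Note the ordering ibd_setup_ud_channel() relies on:
            * ibd_alloc_cqs() must already have produced id_scq_hdl and
            * id_rcq_hdl, ibd_get_port_details() the pkey index, and
            * ibd_find_bgroup() the mcinfo whose mc_qkey seeds the channel;
            * ibd_start() below sequences these steps.
            */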
5508 5508  
5509 5509  static int
5510 5510  ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
5511 5511  {
5512 5512          uint32_t progress = state->id_mac_state;
5513 5513          uint_t attempts;
5514 5514          ibt_status_t ret;
5515 5515          ib_gid_t mgid;
5516 5516          ibd_mce_t *mce;
5517 5517          uint8_t jstate;
5518 5518          timeout_id_t tid;
5519 5519  
5520 5520          if (atomic_dec_32_nv(&state->id_running) != 0)
5521 5521                  cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
5522 5522  
5523 5523          /*
5524 5524           * Before we try to stop/undo whatever we did in ibd_start(),
5525 5525           * we need to mark the link state appropriately to prevent the
5526 5526           * ip layer from using this instance for any new transfers. Note
5527 5527           * that if the original state of the link was "up" when we're
5528 5528           * here, we'll set the final link state to "unknown", to behave
5529 5529           * in the same fashion as other ethernet drivers.
5530 5530           */
5531 5531          mutex_enter(&state->id_link_mutex);
5532 5532          if (cur_link_state == LINK_STATE_DOWN) {
5533 5533                  state->id_link_state = cur_link_state;
5534 5534          } else {
5535 5535                  state->id_link_state = LINK_STATE_UNKNOWN;
5536 5536          }
5537 5537          mutex_exit(&state->id_link_mutex);
5538 5538          bzero(&state->id_macaddr, sizeof (ipoib_mac_t));
5539 5539          mac_link_update(state->id_mh, state->id_link_state);
5540 5540  
5541 5541          state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
5542 5542          if (progress & IBD_DRV_STARTED) {
5543 5543                  state->id_mac_state &= (~IBD_DRV_STARTED);
5544 5544          }
5545 5545  
5546 5546          if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
5547 5547                  state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT);
5548 5548          }
5549 5549  
5550 5550          /* Stop listen under Reliable Connected Mode */
5551 5551          if (progress & IBD_DRV_RC_LISTEN) {
5552 5552                  ASSERT(state->id_enable_rc);
5553 5553                  if (state->rc_listen_hdl != NULL) {
5554 5554                          ibd_rc_stop_listen(state);
5555 5555                  }
5556 5556                  state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
5557 5557          }
5558 5558  
5559 5559          /* Stop timeout routine */
5560 5560          if (progress & IBD_DRV_RC_TIMEOUT) {
5561 5561                  ASSERT(state->id_enable_rc);
5562 5562                  mutex_enter(&state->rc_timeout_lock);
5563 5563                  state->rc_timeout_start = B_FALSE;
5564 5564                  tid = state->rc_timeout;
5565 5565                  state->rc_timeout = 0;
5566 5566                  mutex_exit(&state->rc_timeout_lock);
5567 5567                  if (tid != 0)
5568 5568                          (void) untimeout(tid);
5569 5569                  state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT);
5570 5570          }
5571 5571  
5572 5572          if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
5573 5573                  attempts = 100;
5574 5574                  while (state->id_ah_op == IBD_OP_ONGOING) {
5575 5575                          /*
5576 5576                           * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB
5577 5577                           * port is connecting to a remote IPoIB port. Wait for
5578 5578                           * the end of this connecting operation.
5579 5579                           */
5580 5580                          delay(drv_usectohz(100000));
5581 5581                          if (--attempts == 0) {
5582 5582                                  state->rc_stop_connect++;
5583 5583                                  DPRINT(40, "ibd_undo_start: connecting");
5584 5584                                  break;
5585 5585                          }
5586 5586                  }
5587 5587                  mutex_enter(&state->id_sched_lock);
5588 5588                  state->id_sched_needed = 0;
5589 5589                  mutex_exit(&state->id_sched_lock);
5590 5590                  (void) ibd_rc_close_all_chan(state);
5591 5591          }
5592 5592  
5593 5593          /*
5594 5594           * First, stop receive interrupts; this stops the driver from
5595 5595           * handing up buffers to higher layers.  Wait for receive buffers
5596 5596           * to be returned and give up after 1 second.
5597 5597           */
5598 5598          if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
5599 5599                  attempts = 10;
5600 5600                  while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
5601 5601                      0) > 0) {
5602 5602                          delay(drv_usectohz(100000));
5603 5603                          if (--attempts == 0) {
5604 5604                                  /*
5605 5605                                   * There are pending bufs with the network
5606 5606                                   * layer and we have no choice but to wait
5607 5607                                   * until it is done with them. Reap all the
5608 5608                                   * Tx/Rx completions that were posted since
5609 5609                                   * we turned off the notification and
5610 5610                                   * return failure.
5611 5611                                   */
5612 5612                                  cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
5613 5613                                  DPRINT(2, "ibd_undo_start: "
5614 5614                                      "reclaiming failed");
5615 5615                                  break;
5616 5616                          }
5617 5617                  }
5618 5618                  state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
5619 5619          }
5620 5620  
5621 5621          if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
5622 5622                  ibd_rc_fini_tx_largebuf_list(state);
5623 5623                  state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
5624 5624          }
5625 5625  
5626 5626          if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
5627 5627                  ASSERT(state->id_enable_rc);
5628 5628                  if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
5629 5629                          if (state->id_ah_op == IBD_OP_ONGOING) {
5630 5630                                  delay(drv_usectohz(10000));
5631 5631                                  if (state->id_ah_op == IBD_OP_ONGOING) {
5632 5632                                          /*
5633 5633                                           * "state->id_ah_op == IBD_OP_ONGOING"
5634 5634                                           * means this IPoIB port is connecting
5635 5635                                           * to a remote IPoIB port. We can't
5636 5636                                           * delete SRQ here.
5637 5637                                           */
5638 5638                                          state->rc_stop_connect++;
5639 5639                                          DPRINT(40, "ibd_undo_start: "
5640 5640                                              "connecting");
5641 5641                                  } else {
5642 5642                                          ibd_rc_fini_srq_list(state);
5643 5643                                          state->id_mac_state &=
5644 5644                                              (~IBD_DRV_RC_SRQ_ALLOCD);
5645 5645                                  }
5646 5646                          } else {
5647 5647                                  ibd_rc_fini_srq_list(state);
5648 5648                                  state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
5649 5649                          }
5650 5650                  } else {
5651 5651                          DPRINT(40, "ibd_undo_start: srq bufs outstanding\n");
5652 5652                  }
5653 5653          }
5654 5654  
5655 5655          if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
5656 5656                  ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
5657 5657  
5658 5658                  mutex_enter(&state->id_trap_lock);
5659 5659                  state->id_trap_stop = B_TRUE;
5660 5660                  while (state->id_trap_inprog > 0)
5661 5661                          cv_wait(&state->id_trap_cv, &state->id_trap_lock);
5662 5662                  mutex_exit(&state->id_trap_lock);
5663 5663  
5664 5664                  state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
5665 5665          }
5666 5666  
5667 5667          if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
5668 5668                  /*
5669 5669                   * Flushing the channel ensures that all pending WQE's
5670 5670                   * are marked with flush_error and handed to the CQ. It
5671 5671                   * does not guarantee the invocation of the CQ handler.
5672 5672                   * This call is guaranteed to return successfully for
5673 5673                   * UD QPNs.
5674 5674                   */
5675 5675                  if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
5676 5676                      IBT_SUCCESS) {
5677 5677                          DPRINT(10, "ibd_undo_start: flush_channel "
5678 5678                              "failed, ret=%d", ret);
5679 5679                  }
5680 5680  
5681 5681                  /*
5682 5682                   * Give some time for the TX CQ handler to process the
5683 5683                   * completions.
5684 5684                   */
5685 5685                  attempts = 10;
5686 5686                  mutex_enter(&state->id_tx_list.dl_mutex);
5687 5687                  mutex_enter(&state->id_tx_rel_list.dl_mutex);
5688 5688                  while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
5689 5689                      != state->id_ud_num_swqe) {
5690 5690                          if (--attempts == 0)
5691 5691                                  break;
5692 5692                          mutex_exit(&state->id_tx_rel_list.dl_mutex);
5693 5693                          mutex_exit(&state->id_tx_list.dl_mutex);
5694 5694                          delay(drv_usectohz(100000));
5695 5695                          mutex_enter(&state->id_tx_list.dl_mutex);
5696 5696                          mutex_enter(&state->id_tx_rel_list.dl_mutex);
5697 5697                  }
5698 5698                  ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5699 5699                  if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
5700 5700                      state->id_ud_num_swqe) {
5701 5701                          cmn_err(CE_WARN, "tx resources not freed\n");
5702 5702                  }
5703 5703                  mutex_exit(&state->id_tx_rel_list.dl_mutex);
5704 5704                  mutex_exit(&state->id_tx_list.dl_mutex);
5705 5705  
5706 5706                  attempts = 10;
5707 5707                  while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5708 5708                          if (--attempts == 0)
5709 5709                                  break;
5710 5710                          delay(drv_usectohz(100000));
5711 5711                  }
5712 5712                  ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
5713 5713                  if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5714 5714                          cmn_err(CE_WARN, "rx resources not freed\n");
5715 5715                  }
5716 5716  
5717 5717                  state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
5718 5718          }
5719 5719  
5720 5720          if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
5721 5721                  /*
5722 5722                   * Drop all residual full/non-membership. This includes full
5723 5723                   * membership in the broadcast group, and any non-membership
5724 5724                   * acquired during transmits. We do this after the Tx completion
5725 5725                   * handlers are done, since those might result in some late
5726 5726                   * leaves; this also eliminates a potential race with that
5727 5727                   * path wrt the mc full list insert/delete. Trap handling
5728 5728                   * has also been suppressed at this point. Thus, no locks
5729 5729                   * are required while traversing the mc full list.
5730 5730                   */
5731 5731                  DPRINT(2, "ibd_undo_start: clear full cache entries");
5732 5732                  mce = list_head(&state->id_mc_full);
5733 5733                  while (mce != NULL) {
5734 5734                          mgid = mce->mc_info.mc_adds_vect.av_dgid;
5735 5735                          jstate = mce->mc_jstate;
5736 5736                          mce = list_next(&state->id_mc_full, mce);
5737 5737                          ibd_leave_group(state, mgid, jstate);
5738 5738                  }
5739 5739                  state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5740 5740          }
5741 5741  
5742 5742          if (progress & IBD_DRV_RXLIST_ALLOCD) {
5743 5743                  ibd_fini_rxlist(state);
5744 5744                  state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5745 5745          }
5746 5746  
5747 5747          if (progress & IBD_DRV_TXLIST_ALLOCD) {
5748 5748                  ibd_fini_txlist(state);
5749 5749                  state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5750 5750          }
5751 5751  
5752 5752          if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5753 5753                  if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5754 5754                      IBT_SUCCESS) {
5755 5755                          DPRINT(10, "ibd_undo_start: free_channel "
5756 5756                              "failed, ret=%d", ret);
5757 5757                  }
5758 5758  
5759 5759                  state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5760 5760          }
5761 5761  
5762 5762          if (progress & IBD_DRV_CQS_ALLOCD) {
5763 5763                  kmem_free(state->id_txwcs,
5764 5764                      sizeof (ibt_wc_t) * state->id_txwcs_size);
5765 5765                  if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5766 5766                      IBT_SUCCESS) {
5767 5767                          DPRINT(10, "ibd_undo_start: free_cq(scq) "
5768 5768                              "failed, ret=%d", ret);
5769 5769                  }
5770 5770  
5771 5771                  kmem_free(state->id_rxwcs,
5772 5772                      sizeof (ibt_wc_t) * state->id_rxwcs_size);
5773 5773                  if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5774 5774                          DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5775 5775                              "ret=%d", ret);
5776 5776                  }
5777 5777  
5778 5778                  state->id_txwcs = NULL;
5779 5779                  state->id_rxwcs = NULL;
5780 5780                  state->id_scq_hdl = NULL;
5781 5781                  state->id_rcq_hdl = NULL;
5782 5782  
5783 5783                  state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5784 5784          }
5785 5785  
5786 5786          if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5787 5787                  mutex_enter(&state->id_ac_mutex);
5788 5788                  mod_hash_destroy_hash(state->id_ah_active_hash);
5789 5789                  mutex_exit(&state->id_ac_mutex);
5790 5790                  ibd_acache_fini(state);
5791 5791  
5792 5792                  state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5793 5793          }
5794 5794  
5795 5795          if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5796 5796                  /*
5797 5797                   * If we'd created the IPoIB broadcast group and had
5798 5798                   * successfully joined it, leave it now.
5799 5799                   */
5800 5800                  if (state->id_bgroup_created) {
5801 5801                          mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5802 5802                          jstate = IB_MC_JSTATE_FULL;
5803 5803                          (void) ibt_leave_mcg(state->id_sgid, mgid,
5804 5804                              state->id_sgid, jstate);
5805 5805                  }
5806 5806                  ibt_free_mcg_info(state->id_mcinfo, 1);
5807 5807  
5808 5808                  state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5809 5809          }
5810 5810  
5811 5811          return (DDI_SUCCESS);
5812 5812  }
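
           /*
            * The unwind idiom used throughout ibd_undo_start() above: each
            * resource acquired during startup records an IBD_DRV_* bit in
            * id_mac_state, and teardown is keyed off those bits in roughly
            * reverse order of acquisition. Schematically (IBD_DRV_FOO is a
            * placeholder, not a real flag):
            *
            *	if (progress & IBD_DRV_FOO) {
            *		... release the foo resource ...
            *		state->id_mac_state &= (~IBD_DRV_FOO);
            *	}
            *
            * This is what lets a partially completed start sequence be
            * unwound with the same routine.
            */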
5813 5813  
5814 5814  /*
5815 5815   * This pair of routines is used to set/clear the condition that
5816 5816   * the caller is about to do something that may change id_mac_state.
5817 5817   * If there's already someone doing either a start or a stop (possibly
5818 5818   * due to the async handler detecting a pkey relocation event, a plumb
5819 5819   * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5820 5820   * that's done.
5821 5821   */
5822 5822  static void
5823 5823  ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5824 5824  {
5825 5825          mutex_enter(&state->id_macst_lock);
5826 5826          while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5827 5827                  cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5828 5828  
5829 5829          state->id_mac_state |= flag;
5830 5830          mutex_exit(&state->id_macst_lock);
5831 5831  }
5832 5832  
5833 5833  static void
5834 5834  ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5835 5835  {
5836 5836          mutex_enter(&state->id_macst_lock);
5837 5837          state->id_mac_state &= (~flag);
5838 5838          cv_signal(&state->id_macst_cv);
5839 5839          mutex_exit(&state->id_macst_lock);
5840 5840  }
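
           /*
            * Typical usage of the pair (see ibd_m_start() below):
            *
            *	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
            *	ret = ibd_start(state);
            *	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
            */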
5841 5841  
5842 5842  /*
5843 5843   * GLDv3 entry point to start hardware.
5844 5844   */
5845 5845  /*ARGSUSED*/
5846 5846  static int
5847 5847  ibd_m_start(void *arg)
5848 5848  {
5849 5849          ibd_state_t *state = arg;
5850 5850          int     ret;
5851 5851  
5852 5852          if (state->id_type == IBD_PORT_DRIVER)
5853 5853                  return (EINVAL);
5854 5854  
5855 5855          ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5856 5856          if (state->id_mac_state & IBD_DRV_IN_DELETION) {
5857 5857                  ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5858 5858                  return (EIO);
5859 5859          }
5860 5860  
5861 5861          ret = ibd_start(state);
5862 5862          ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5863 5863          return (ret);
5864 5864  }
5865 5865  
5866 5866  static int
5867 5867  ibd_start(ibd_state_t *state)
5868 5868  {
5869 5869          int err;
5870 5870          ibt_status_t ret;
5871 5871          int late_hca_init = 0;
5872 5872  
5873 5873          if (state->id_mac_state & IBD_DRV_STARTED)
5874 5874                  return (DDI_SUCCESS);
5875 5875  
5876 5876          /*
5877 5877           * We do not increment the running flag when calling ibd_start() as
5878 5878           * a result of some event which moves the state away from late HCA
5879 5879           * initialization, viz. MCG_CREATED, PORT_CHANGE or link availability.
5880 5880           */
5881 5881          if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
5882 5882              (atomic_inc_32_nv(&state->id_running) != 1)) {
5883 5883                  DPRINT(10, "ibd_start: id_running is non-zero");
5884 5884                  cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
5885 5885                  atomic_dec_32(&state->id_running);
5886 5886                  return (EINVAL);
5887 5887          }
5888 5888  
5889 5889          /*
5890 5890           * Get port details; if we fail here, something bad happened.
5891 5891           * Fail plumb.
5892 5892           */
5893 5893          if ((err = ibd_get_port_details(state)) != 0) {
5894 5894                  DPRINT(10, "ibd_start: ibd_get_port_details() failed");
5895 5895                  goto start_fail;
5896 5896          }
5897 5897          /*
5898 5898           * If state->id_link_state is DOWN, it indicates that either the port
5899 5899           * is down, or the pkey is not available. In both cases, resort to late
5900 5900           * initialization. Register for subnet notices, and return success.
5901 5901           */
5902 5902          state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
5903 5903          if (state->id_link_state == LINK_STATE_DOWN) {
5904 5904                  late_hca_init = 1;
5905 5905                  goto late_hca_init_return;
5906 5906          }
5907 5907  
5908 5908          /*
5909 5909           * Find the IPoIB broadcast group
5910 5910           */
5911 5911          if (ibd_find_bgroup(state) != IBT_SUCCESS) {
5912 5912                  /* Resort to late initialization */
5913 5913                  late_hca_init = 1;
5914 5914                  goto reg_snet_notices;
5915 5915          }
5916 5916          state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
5917 5917  
5918 5918          /*
5919 5919           * Initialize per-interface caches and lists; if we fail here,
5920 5920           * it is most likely due to a lack of resources
5921 5921           */
5922 5922          if (ibd_acache_init(state) != DDI_SUCCESS) {
5923 5923                  DPRINT(10, "ibd_start: ibd_acache_init() failed");
5924 5924                  err = ENOMEM;
5925 5925                  goto start_fail;
5926 5926          }
5927 5927          state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
5928 5928  
5929 5929          /*
5930 5930           * Allocate send and receive completion queues
5931 5931           */
5932 5932          if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
5933 5933                  DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
5934 5934                  err = ENOMEM;
5935 5935                  goto start_fail;
5936 5936          }
5937 5937          state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
5938 5938  
5939 5939          /*
5940 5940           * Setup a UD channel
5941 5941           */
5942 5942          if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
5943 5943                  err = ENOMEM;
5944 5944                  DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
5945 5945                  goto start_fail;
5946 5946          }
5947 5947          state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
5948 5948  
5949 5949          /*
5950 5950           * Allocate and initialize the tx buffer list
5951 5951           */
5952 5952          if (ibd_init_txlist(state) != DDI_SUCCESS) {
5953 5953                  DPRINT(10, "ibd_start: ibd_init_txlist() failed");
5954 5954                  err = ENOMEM;
5955 5955                  goto start_fail;
5956 5956          }
5957 5957          state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
5958 5958  
5959 5959          /*
5960 5960           * Create the send cq handler here
5961 5961           */
5962 5962          ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
5963 5963          if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
5964 5964              IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5965 5965                  DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
5966 5966                      "failed, ret=%d", ret);
5967 5967                  err = EINVAL;
5968 5968                  goto start_fail;
5969 5969          }
5970 5970          state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
5971 5971  
5972 5972          /*
5973 5973           * Allocate and initialize the rx buffer list
5974 5974           */
5975 5975          if (ibd_init_rxlist(state) != DDI_SUCCESS) {
5976 5976                  DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
5977 5977                  err = ENOMEM;
5978 5978                  goto start_fail;
5979 5979          }
5980 5980          state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
5981 5981  
5982 5982          /*
5983 5983           * Join IPoIB broadcast group
5984 5984           */
5985 5985          if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
5986 5986                  DPRINT(10, "ibd_start: ibd_join_group() failed");
5987 5987                  err = ENOTACTIVE;
5988 5988                  goto start_fail;
5989 5989          }
5990 5990          state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
5991 5991  
5992 5992          /*
5993 5993           * When we did mac_register() in ibd_attach(), we didn't register
5994 5994           * the real macaddr and we didn't have the true port mtu. Now that
5995 5995           * we're almost ready, set the local mac address and broadcast
5996 5996           * addresses and update gldv3 about the real values of these
5997 5997           * parameters.
5998 5998           */
5999 5999          if (state->id_enable_rc) {
6000 6000                  ibd_h2n_mac(&state->id_macaddr,
6001 6001                      IBD_MAC_ADDR_RC + state->id_qpnum,
6002 6002                      state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6003 6003                  ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
6004 6004                      state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6005 6005          } else {
6006 6006                  ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
6007 6007                      state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6008 6008          }
6009 6009          ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
6010 6010              state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
6011 6011  
6012 6012          if (!state->id_enable_rc) {
6013 6013                  (void) mac_maxsdu_update2(state->id_mh,
6014 6014                      state->id_mtu - IPOIB_HDRSIZE,
6015 6015                      state->id_mtu - IPOIB_HDRSIZE);
6016 6016          }
6017 6017          mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6018 6018  
6019 6019          /*
6020 6020           * Setup the receive cq handler
6021 6021           */
6022 6022          ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
6023 6023          if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
6024 6024              IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
6025 6025                  DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
6026 6026                      "failed, ret=%d", ret);
6027 6027                  err = EINVAL;
6028 6028                  goto start_fail;
6029 6029          }
6030 6030          state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
6031 6031  
6032 6032  reg_snet_notices:
6033 6033          /*
6034 6034           * In the case of the normal initialization sequence, set up the
6035 6035           * subnet notices handler after we've initialized the acache/
6036 6036           * mcache and started the async thread, both of which are required for
6037 6037           * the trap handler to function properly.
6038 6038           *
6039 6039           * Now that the async thread has been started (and we've already done
6040 6040           * a mac_register() during attach so mac_tx_update() can be called
6041 6041           * if necessary without any problem), we can enable the trap handler
6042 6042           * to queue requests to the async thread.
6043 6043           *
6044 6044           * In case of late hca initialization, the subnet notices handler will
6045 6045           * only handle MCG created/deleted events. The action performed as part
6046 6046           * of handling these events is to start the interface. So, the
6047 6047           * acache/mcache initialization is not a necessity in such cases for
6048 6048           * registering the subnet notices handler. Also, if we are in
6049 6049           * ibd_start() as a result of, say, some event handling after entering
6050 6050           * late hca initialization phase no need to register again.
6051 6051           */
6052 6052          if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) {
6053 6053                  ibt_register_subnet_notices(state->id_ibt_hdl,
6054 6054                      ibd_snet_notices_handler, state);
6055 6055                  mutex_enter(&state->id_trap_lock);
6056 6056                  state->id_trap_stop = B_FALSE;
6057 6057                  mutex_exit(&state->id_trap_lock);
6058 6058                  state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
6059 6059          }
6060 6060  
6061 6061  late_hca_init_return:
6062 6062          if (late_hca_init == 1) {
6063 6063                  state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT;
6064 6064                  /*
6065 6065                   * In case of late initialization, mark the link state as down,
6066 6066                   * irrespective of the actual link state as reported in the
6067 6067                   * port_info.
6068 6068                   */
6069 6069                  state->id_link_state = LINK_STATE_DOWN;
6070 6070                  mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6071 6071                  mac_link_update(state->id_mh, state->id_link_state);
6072 6072                  return (DDI_SUCCESS);
6073 6073          }
6074 6074  
6075 6075          if (state->id_enable_rc) {
6076 6076                  if (state->rc_enable_srq) {
6077 6077                          if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
6078 6078                                  if (ibd_rc_repost_srq_free_list(state) !=
6079 6079                                      IBT_SUCCESS) {
6080 6080                                          err = ENOMEM;
6081 6081                                          goto start_fail;
6082 6082                                  }
6083 6083                          } else {
6084 6084                                  /* Allocate SRQ resource */
6085 6085                                  if (ibd_rc_init_srq_list(state) !=
6086 6086                                      IBT_SUCCESS) {
6087 6087                                          err = ENOMEM;
6088 6088                                          goto start_fail;
6089 6089                                  }
6090 6090                                  state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
6091 6091                          }
6092 6092                  }
6093 6093  
6094 6094                  if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
6095 6095                          DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
6096 6096                              "failed");
6097 6097                          err = ENOMEM;
6098 6098                          goto start_fail;
6099 6099                  }
6100 6100                  state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
6101 6101  
6102 6102                  /* RC: begin to listen only after everything is available */
6103 6103                  if (ibd_rc_listen(state) != IBT_SUCCESS) {
6104 6104                          DPRINT(10, "ibd_start: ibd_rc_listen() failed");
6105 6105                          err = EINVAL;
6106 6106                          goto start_fail;
6107 6107                  }
6108 6108                  state->id_mac_state |= IBD_DRV_RC_LISTEN;
6109 6109          }
6110 6110  
6111 6111          /*
6112 6112           * Indicate link status to GLDv3 and higher layers. By default,
6113 6113           * we assume we are in up state (which must have been true at
6114 6114           * least at the time the broadcast mcg's were probed); if there
6115 6115           * were any up/down transitions till the time we come here, the
6116 6116           * async handler will have updated the last known state, which we
6117 6117           * use to tell GLDv3. The async handler will not send any
6118 6118           * notifications to GLDv3 till we reach here in the initialization
6119 6119           * sequence.
6120 6120           */
6121 6121          mac_link_update(state->id_mh, state->id_link_state);
6122 6122          state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT;
6123 6123          state->id_mac_state |= IBD_DRV_STARTED;
6124 6124  
6125 6125          /* Start timer after everything is ready */
6126 6126          if (state->id_enable_rc) {
6127 6127                  mutex_enter(&state->rc_timeout_lock);
6128 6128                  state->rc_timeout_start = B_TRUE;
6129 6129                  state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
6130 6130                      SEC_TO_TICK(ibd_rc_conn_timeout));
6131 6131                  mutex_exit(&state->rc_timeout_lock);
6132 6132                  state->id_mac_state |= IBD_DRV_RC_TIMEOUT;
6133 6133          }
6134 6134  
6135 6135          return (DDI_SUCCESS);
6136 6136  
6137 6137  start_fail:
6138 6138          /*
6139 6139           * If we ran into a problem during ibd_start() and ran into
6140 6140           * some other problem during undoing our partial work, we can't
6141 6141           * do anything about it.  Ignore any errors we might get from
6142 6142           * ibd_undo_start() and just return the original error we got.
6143 6143           */
6144 6144          (void) ibd_undo_start(state, LINK_STATE_DOWN);
6145 6145          return (err);
6146 6146  }
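
/*
 * Editor's aside -- the ibd_start()/ibd_undo_start() pattern in
 * miniature: each successful init step sets a bit in a `progress'
 * mask, and the failure path unwinds only the steps whose bits are
 * set, in reverse order.  step_a/step_b, INIT_A/INIT_B and the undo
 * routines are hypothetical stand-ins.
 */
#include <stdio.h>

#define	INIT_A	0x1
#define	INIT_B	0x2

static int step_a(void) { return (0); }		/* 0 == success */
static int step_b(void) { return (-1); }	/* force a failure */
static void undo_a(void) { (void) puts("undo a"); }
static void undo_b(void) { (void) puts("undo b"); }

static int
start(void)
{
	unsigned progress = 0;
	int err;

	if ((err = step_a()) != 0)
		goto fail;
	progress |= INIT_A;

	if ((err = step_b()) != 0)
		goto fail;
	progress |= INIT_B;

	return (0);
fail:
	/* unwind only what actually completed */
	if (progress & INIT_B)
		undo_b();
	if (progress & INIT_A)
		undo_a();
	return (err);
}

int
main(void)
{
	return (start() == 0 ? 0 : 1);
}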
6147 6147  
6148 6148  /*
6149 6149   * GLDv3 entry point to stop hardware from receiving packets.
6150 6150   */
6151 6151  /*ARGSUSED*/
6152 6152  static void
6153 6153  ibd_m_stop(void *arg)
6154 6154  {
6155 6155          ibd_state_t *state = (ibd_state_t *)arg;
6156 6156  
6157 6157          if (state->id_type == IBD_PORT_DRIVER)
6158 6158                  return;
6159 6159  
6160 6160          ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6161 6161  
6162 6162          (void) ibd_undo_start(state, state->id_link_state);
6163 6163  
6164 6164          ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6165 6165  }
6166 6166  
6167 6167  /*
6168 6168   * GLDv3 entry point to modify device's mac address. We do not
6169 6169   * allow address modifications.
6170 6170   */
6171 6171  static int
6172 6172  ibd_m_unicst(void *arg, const uint8_t *macaddr)
6173 6173  {
6174 6174          ibd_state_t *state = arg;
6175 6175  
6176 6176          if (state->id_type == IBD_PORT_DRIVER)
6177 6177                  return (EINVAL);
6178 6178  
6179 6179          /*
6180 6180           * Don't bother even comparing the macaddr if we haven't
6181 6181           * completed ibd_m_start().
6182 6182           */
6183 6183          if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6184 6184                  return (0);
6185 6185  
6186 6186          if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
6187 6187                  return (0);
6188 6188          else
6189 6189                  return (EINVAL);
6190 6190  }
6191 6191  
6192 6192  /*
6193 6193   * The blocking parts of the IBA join/leave operations are done out
6194 6194   * of here on the async thread.
6195 6195   */
6196 6196  static void
6197 6197  ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
6198 6198  {
6199 6199          DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
6200 6200              "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
6201 6201  
6202 6202          if (op == IBD_ASYNC_JOIN) {
6203 6203                  if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
6204 6204                          ibd_print_warn(state, "Join multicast group failed: "
6205 6205                              "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6206 6206                  }
6207 6207          } else {
6208 6208                  /*
6209 6209                   * Here, we must search for the proper mcg_info and
6210 6210                   * use that to leave the group.
6211 6211                   */
6212 6212                  ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
6213 6213          }
6214 6214  }
6215 6215  
6216 6216  /*
6217 6217   * GLDv3 entry point for multicast enable/disable requests.
6218 6218   * This function queues the operation to the async thread and
6219 6219   * returns success for a valid multicast address.
6220 6220   */
6221 6221  static int
6222 6222  ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
6223 6223  {
6224 6224          ibd_state_t *state = (ibd_state_t *)arg;
6225 6225          ipoib_mac_t maddr, *mcast;
6226 6226          ib_gid_t mgid;
6227 6227          ibd_req_t *req;
6228 6228  
6229 6229          if (state->id_type == IBD_PORT_DRIVER)
6230 6230                  return (EINVAL);
6231 6231  
6232 6232          /*
6233 6233           * If we haven't completed ibd_m_start(), the async thread wouldn't
6234 6234           * have been started and id_bcaddr wouldn't be set, so there's
6235 6235           * no point in continuing.
6236 6236           */
6237 6237          if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6238 6238                  return (0);
6239 6239  
6240 6240          /*
6241 6241           * The incoming multicast address might not be aligned properly
6242 6242           * on a 4-byte boundary to be treated as an ipoib_mac_t, so we
6243 6243           * copy it into a properly aligned local (maddr) and use the
6244 6244           * ipoib_mac_t pointer to that copy to pick out the mc gid and
6245 6245           * qpn fields safely.
6246 6246           */
6247 6247          bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
6248 6248          mcast = &maddr;
6249 6249  
6250 6250          /*
6251 6251           * Check validity of MCG address. We could additionally check
6252 6252           * that an enable/disable is not being issued on the "broadcast"
6253 6253           * mcg, but since this operation is only invokable by privileged
6254 6254           * programs anyway, we allow the flexibility to those dlpi apps.
6255 6255           * Note that we do not validate the "scope" of the IBA mcg.
6256 6256           */
6257 6257          if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
6258 6258                  return (EINVAL);
6259 6259  
6260 6260          /*
6261 6261           * fill in multicast pkey and scope
6262 6262           */
6263 6263          IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
6264 6264  
6265 6265          /*
6266 6266           * If someone is trying to JOIN/LEAVE the broadcast group, we do
6267 6267           * nothing (i.e. we stay JOINed to the broadcast group joined in
6268 6268           * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
6269 6269           * requires being joined to broadcast groups at all times.
6270 6270           * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
6271 6271           * depends on this.
6272 6272           */
6273 6273          if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6274 6274                  return (0);
6275 6275  
6276 6276          ibd_n2h_gid(mcast, &mgid);
6277 6277          req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6278 6278          if (req == NULL)
6279 6279                  return (ENOMEM);
6280 6280  
6281 6281          req->rq_gid = mgid;
6282 6282  
6283 6283          if (add) {
6284 6284                  DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
6285 6285                      mgid.gid_prefix, mgid.gid_guid);
6286 6286                  ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
6287 6287          } else {
6288 6288                  DPRINT(1, "ibd_m_multicst : unset_multicast : "
6289 6289                      "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6290 6290                  ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
6291 6291          }
6292 6292          return (0);
6293 6293  }
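
/*
 * Editor's aside -- a sketch of how the 20-byte IPoIB address splits
 * into its QPN and GID halves, along the lines of the ibd_n2h_gid()
 * call above.  The struct mirrors the ipoib_mac_t fields used in this
 * file (a 32-bit qpn word plus four 32-bit network-order gid words);
 * treat the layout and all names here as illustrative, not
 * authoritative.
 */
#include <stdint.h>
#include <arpa/inet.h>	/* ntohl */

typedef struct {
	uint32_t m_qpn;		/* flags + QPN, network order */
	uint32_t m_gidpref[2];	/* GID prefix, network order */
	uint32_t m_gidsuff[2];	/* GID guid, network order */
} mac20_t;

typedef struct {
	uint64_t gid_prefix;
	uint64_t gid_guid;
} gid64_t;

static void
mac_to_gid(const mac20_t *mac, gid64_t *gid)
{
	gid->gid_prefix = ((uint64_t)ntohl(mac->m_gidpref[0]) << 32) |
	    ntohl(mac->m_gidpref[1]);
	gid->gid_guid = ((uint64_t)ntohl(mac->m_gidsuff[0]) << 32) |
	    ntohl(mac->m_gidsuff[1]);
}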
6294 6294  
6295 6295  /*
6296 6296   * The blocking parts of the IBA promiscuous operations are done
6297 6297   * out of here on the async thread. The invocation may be the
6298 6298   * result of a dlpi request or the result of a port up/down
6299 6299   * event.
6300 6300   */
6301 6301  static void
6302 6302  ibd_async_unsetprom(ibd_state_t *state)
6303 6303  {
6304 6304          ibd_mce_t *mce = list_head(&state->id_mc_non);
6305 6305          ib_gid_t mgid;
6306 6306  
6307 6307          DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
6308 6308  
6309 6309          while (mce != NULL) {
6310 6310                  mgid = mce->mc_info.mc_adds_vect.av_dgid;
6311 6311                  mce = list_next(&state->id_mc_non, mce);
6312 6312                  ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
6313 6313          }
6314 6314          state->id_prom_op = IBD_OP_NOTSTARTED;
6315 6315  }
6316 6316  
6317 6317  /*
6318 6318   * The blocking parts of the IBA promiscuous operations are done
6319 6319   * out of here on the async thread. The invocation may be the
6320 6320   * result of a dlpi request or the result of a port up/down
6321 6321   * event.
6322 6322   */
6323 6323  static void
6324 6324  ibd_async_setprom(ibd_state_t *state)
6325 6325  {
6326 6326          ibt_mcg_attr_t mcg_attr;
6327 6327          ibt_mcg_info_t *mcg_info;
6328 6328          ib_gid_t mgid;
6329 6329          uint_t numg;
6330 6330          int i;
6331 6331          char ret = IBD_OP_COMPLETED;
6332 6332  
6333 6333          DPRINT(2, "ibd_async_setprom : async_set_promisc");
6334 6334  
6335 6335          /*
6336 6336           * Obtain all active MC groups on the IB fabric with
6337 6337           * specified criteria (scope + Pkey + Qkey + mtu).
6338 6338           */
6339 6339          bzero(&mcg_attr, sizeof (mcg_attr));
6340 6340          mcg_attr.mc_pkey = state->id_pkey;
6341 6341          mcg_attr.mc_scope = state->id_scope;
6342 6342          mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
6343 6343          mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
6344 6344          mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
6345 6345          if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
6346 6346              IBT_SUCCESS) {
6347 6347                  ibd_print_warn(state, "Could not get list of IBA multicast "
6348 6348                      "groups");
6349 6349                  ret = IBD_OP_ERRORED;
6350 6350                  goto done;
6351 6351          }
6352 6352  
6353 6353          /*
6354 6354           * Iterate over the returned mcg's and join as NonMember
6355 6355           * to the IP mcg's.
6356 6356           */
6357 6357          for (i = 0; i < numg; i++) {
6358 6358                  /*
6359 6359                   * Do a NonMember JOIN on the MC group.
6360 6360                   */
6361 6361                  mgid = mcg_info[i].mc_adds_vect.av_dgid;
6362 6362                  if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
6363 6363                          ibd_print_warn(state, "IBA promiscuous mode missed "
6364 6364                              "multicast gid %016llx:%016llx",
6365 6365                              (u_longlong_t)mgid.gid_prefix,
6366 6366                              (u_longlong_t)mgid.gid_guid);
6367 6367          }
6368 6368  
6369 6369          ibt_free_mcg_info(mcg_info, numg);
6370 6370          DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
6371 6371  done:
6372 6372          state->id_prom_op = ret;
6373 6373  }
6374 6374  
6375 6375  /*
6376 6376   * GLDv3 entry point for multicast promiscuous enable/disable requests.
6377 6377   * GLDv3 assumes phys state receives more packets than multi state,
6378 6378   * which is not true for IPoIB. Thus, treat the multi and phys
6379 6379   * promiscuous states the same way to work with GLDv3's assumption.
6380 6380   */
6381 6381  static int
6382 6382  ibd_m_promisc(void *arg, boolean_t on)
6383 6383  {
6384 6384          ibd_state_t *state = (ibd_state_t *)arg;
6385 6385          ibd_req_t *req;
6386 6386  
6387 6387          if (state->id_type == IBD_PORT_DRIVER)
6388 6388                  return (EINVAL);
6389 6389  
6390 6390          /*
6391 6391           * The async thread won't have been started if we haven't
6392 6392           * passed ibd_m_start().
6393 6393           */
6394 6394          if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6395 6395                  return (0);
6396 6396  
6397 6397          req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6398 6398          if (req == NULL)
6399 6399                  return (ENOMEM);
6400 6400          if (on) {
6401 6401                  DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
6402 6402                  ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
6403 6403          } else {
6404 6404                  DPRINT(1, "ibd_m_promisc : unset_promisc");
6405 6405                  ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
6406 6406          }
6407 6407  
6408 6408          return (0);
6409 6409  }
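
/*
 * Editor's aside -- the non-blocking handoff used by ibd_m_promisc()
 * and ibd_m_multicst(): the GLDv3 entry point must not sleep, so it
 * allocates a request without blocking and queues it for the async
 * thread.  work_t, queue_work() and entry_point() are hypothetical
 * stand-ins for ibd_req_t and ibd_queue_work_slot(); a real queue
 * would lock and signal the worker.
 */
#include <stdlib.h>
#include <errno.h>

typedef struct work {
	int w_op;
	struct work *w_next;
} work_t;

static work_t *work_head;	/* consumed by a worker thread (not shown) */

static void
queue_work(work_t *w)
{
	w->w_next = work_head;	/* unsynchronized for brevity */
	work_head = w;
}

static int
entry_point(int op)
{
	work_t *w;

	if ((w = malloc(sizeof (*w))) == NULL)
		return (ENOMEM);	/* caller sees the failure at once */
	w->w_op = op;
	queue_work(w);
	return (0);
}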
6410 6410  
6411 6411  /*
6412 6412   * GLDv3 entry point for gathering statistics.
6413 6413   */
6414 6414  static int
6415 6415  ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
6416 6416  {
6417 6417          ibd_state_t *state = (ibd_state_t *)arg;
6418 6418  
6419 6419          switch (stat) {
6420 6420          case MAC_STAT_IFSPEED:
6421 6421                  *val = state->id_link_speed;
6422 6422                  break;
6423 6423          case MAC_STAT_MULTIRCV:
6424 6424                  *val = state->id_multi_rcv;
6425 6425                  break;
6426 6426          case MAC_STAT_BRDCSTRCV:
6427 6427                  *val = state->id_brd_rcv;
6428 6428                  break;
6429 6429          case MAC_STAT_MULTIXMT:
6430 6430                  *val = state->id_multi_xmt;
6431 6431                  break;
6432 6432          case MAC_STAT_BRDCSTXMT:
6433 6433                  *val = state->id_brd_xmt;
6434 6434                  break;
6435 6435          case MAC_STAT_RBYTES:
6436 6436                  *val = state->id_rcv_bytes + state->rc_rcv_trans_byte
6437 6437                      + state->rc_rcv_copy_byte;
6438 6438                  break;
6439 6439          case MAC_STAT_IPACKETS:
6440 6440                  *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
6441 6441                      + state->rc_rcv_copy_pkt;
6442 6442                  break;
6443 6443          case MAC_STAT_OBYTES:
6444 6444                  *val = state->id_xmt_bytes + state->rc_xmt_bytes;
6445 6445                  break;
6446 6446          case MAC_STAT_OPACKETS:
6447 6447                  *val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
6448 6448                      state->rc_xmt_fragmented_pkt +
6449 6449                      state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
6450 6450                  break;
6451 6451          case MAC_STAT_OERRORS:
6452 6452                  *val = state->id_ah_error;      /* failed AH translation */
6453 6453                  break;
6454 6454          case MAC_STAT_IERRORS:
6455 6455                  *val = 0;
6456 6456                  break;
6457 6457          case MAC_STAT_NOXMTBUF:
6458 6458                  *val = state->id_tx_short + state->rc_swqe_short +
6459 6459                      state->rc_xmt_buf_short;
6460 6460                  break;
6461 6461          case MAC_STAT_NORCVBUF:
6462 6462          default:
6463 6463                  return (ENOTSUP);
6464 6464          }
6465 6465  
6466 6466          return (0);
6467 6467  }
6468 6468  
6469 6469  static void
6470 6470  ibd_async_txsched(ibd_state_t *state)
6471 6471  {
6472 6472          ibd_resume_transmission(state);
6473 6473  }
6474 6474  
6475 6475  static void
6476 6476  ibd_resume_transmission(ibd_state_t *state)
6477 6477  {
6478 6478          int flag;
6479 6479          int met_thresh = 0;
6480 6480          int thresh = 0;
6481 6481          int ret = -1;
6482 6482  
6483 6483          mutex_enter(&state->id_sched_lock);
6484 6484          if (state->id_sched_needed & IBD_RSRC_SWQE) {
6485 6485                  mutex_enter(&state->id_tx_list.dl_mutex);
6486 6486                  mutex_enter(&state->id_tx_rel_list.dl_mutex);
6487 6487                  met_thresh = state->id_tx_list.dl_cnt +
6488 6488                      state->id_tx_rel_list.dl_cnt;
6489 6489                  mutex_exit(&state->id_tx_rel_list.dl_mutex);
6490 6490                  mutex_exit(&state->id_tx_list.dl_mutex);
6491 6491                  thresh = IBD_FREE_SWQES_THRESH;
6492 6492                  flag = IBD_RSRC_SWQE;
6493 6493          } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6494 6494                  ASSERT(state->id_lso != NULL);
6495 6495                  mutex_enter(&state->id_lso_lock);
6496 6496                  met_thresh = state->id_lso->bkt_nfree;
6497 6497                  thresh = IBD_FREE_LSOS_THRESH;
6498 6498                  mutex_exit(&state->id_lso_lock);
6499 6499                  flag = IBD_RSRC_LSOBUF;
6500 6500                  if (met_thresh > thresh)
6501 6501                          state->id_sched_lso_cnt++;
6502 6502          }
6503 6503          if (met_thresh > thresh) {
6504 6504                  state->id_sched_needed &= ~flag;
6505 6505                  state->id_sched_cnt++;
6506 6506                  ret = 0;
6507 6507          }
6508 6508          mutex_exit(&state->id_sched_lock);
6509 6509  
6510 6510          if (ret == 0)
6511 6511                  mac_tx_update(state->id_mh);
6512 6512  }
6513 6513  
6514 6514  /*
6515 6515   * Release the send wqes back onto the free list.
6516 6516   */
6517 6517  static void
6518 6518  ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6519 6519  {
6520 6520          /*
6521 6521           * Add back on Tx list for reuse.
6522 6522           */
6523 6523          ASSERT(tail->swqe_next == NULL);
6524 6524          mutex_enter(&state->id_tx_rel_list.dl_mutex);
6525 6525          state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6526 6526          tail->swqe_next = state->id_tx_rel_list.dl_head;
6527 6527          state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6528 6528          state->id_tx_rel_list.dl_cnt += n;
6529 6529          mutex_exit(&state->id_tx_rel_list.dl_mutex);
6530 6530  }
6531 6531  
6532 6532  /*
6533 6533   * Acquire a send wqe from the free list.
6534 6534   * Returns the send wqe pointer, or NULL if none is available.
6535 6535   */
6536 6536  static ibd_swqe_t *
6537 6537  ibd_acquire_swqe(ibd_state_t *state)
6538 6538  {
6539 6539          ibd_swqe_t *wqe;
6540 6540  
6541 6541          mutex_enter(&state->id_tx_rel_list.dl_mutex);
6542 6542          if (state->id_tx_rel_list.dl_head != NULL) {
6543 6543                  /* transfer id_tx_rel_list to id_tx_list */
6544 6544                  state->id_tx_list.dl_head =
6545 6545                      state->id_tx_rel_list.dl_head;
6546 6546                  state->id_tx_list.dl_cnt =
6547 6547                      state->id_tx_rel_list.dl_cnt;
6548 6548                  state->id_tx_list.dl_pending_sends = B_FALSE;
6549 6549  
6550 6550                  /* clear id_tx_rel_list */
6551 6551                  state->id_tx_rel_list.dl_head = NULL;
6552 6552                  state->id_tx_rel_list.dl_cnt = 0;
6553 6553                  mutex_exit(&state->id_tx_rel_list.dl_mutex);
6554 6554  
6555 6555                  wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6556 6556                  state->id_tx_list.dl_cnt -= 1;
6557 6557                  state->id_tx_list.dl_head = wqe->swqe_next;
6558 6558          } else {        /* no free swqe */
6559 6559                  mutex_exit(&state->id_tx_rel_list.dl_mutex);
6560 6560                  state->id_tx_list.dl_pending_sends = B_TRUE;
6561 6561                  DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6562 6562                  state->id_tx_short++;
6563 6563                  wqe = NULL;
6564 6564          }
6565 6565          return (wqe);
6566 6566  }
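
/*
 * Editor's aside -- the two-list free-list idiom behind
 * ibd_release_swqe()/ibd_acquire_swqe() in isolation: completions
 * push onto a "release" list under its own lock, and the send side
 * only takes that lock when its private "tx" list runs dry, grabbing
 * the whole release list in one swap.  The sketch assumes a single
 * consumer (the driver also holds a tx-list lock); entry_t and the
 * function names are hypothetical.
 */
#include <pthread.h>
#include <stddef.h>

typedef struct entry {
	struct entry *e_next;
} entry_t;

static pthread_mutex_t rel_lock = PTHREAD_MUTEX_INITIALIZER;
static entry_t *rel_head;	/* refilled by the completion side */
static entry_t *tx_head;	/* drained by the (single) send side */

static void
release_entry(entry_t *e)
{
	(void) pthread_mutex_lock(&rel_lock);
	e->e_next = rel_head;
	rel_head = e;
	(void) pthread_mutex_unlock(&rel_lock);
}

static entry_t *
acquire_entry(void)
{
	entry_t *e;

	if (tx_head == NULL) {
		/* tx list dry: take the whole release list in one shot */
		(void) pthread_mutex_lock(&rel_lock);
		tx_head = rel_head;
		rel_head = NULL;
		(void) pthread_mutex_unlock(&rel_lock);
	}
	if ((e = tx_head) != NULL)
		tx_head = e->e_next;
	return (e);		/* NULL when truly exhausted */
}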
6567 6567  
6568 6568  static int
6569 6569  ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6570 6570      ibt_ud_dest_hdl_t ud_dest)
6571 6571  {
6572 6572          mblk_t  *nmp;
6573 6573          int iph_len, tcph_len;
6574 6574          ibt_wr_lso_t *lso;
6575 6575          uintptr_t ip_start, tcp_start;
6576 6576          uint8_t *dst;
6577 6577          uint_t pending, mblen;
6578 6578  
6579 6579          /*
6580 6580           * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6581 6581           * we need to adjust it here for lso.
6582 6582           */
6583 6583          lso = &(node->w_swr.wr.ud_lso);
6584 6584          lso->lso_ud_dest = ud_dest;
6585 6585          lso->lso_mss = mss;
6586 6586  
6587 6587          /*
6588 6588           * Calculate the LSO header size and set it in the UD LSO structure.
6589 6589           * Note that the only assumption we make is that each of the IPoIB,
6590 6590           * IP and TCP headers will be contained in a single mblk fragment;
6591 6591           * together, the headers may span multiple mblk fragments.
6592 6592           */
6593 6593          nmp = mp;
6594 6594          ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
6595 6595          if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
6596 6596                  ip_start = (uintptr_t)nmp->b_cont->b_rptr
6597 6597                      + (ip_start - (uintptr_t)(nmp->b_wptr));
6598 6598                  nmp = nmp->b_cont;
6599 6599  
6600 6600          }
6601 6601          iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
6602 6602  
6603 6603          tcp_start = ip_start + iph_len;
6604 6604          if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
6605 6605                  tcp_start = (uintptr_t)nmp->b_cont->b_rptr
6606 6606                      + (tcp_start - (uintptr_t)(nmp->b_wptr));
6607 6607                  nmp = nmp->b_cont;
6608 6608          }
6609 6609          tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
6610 6610          lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
6611 6611  
6612 6612          /*
6613 6613           * If the lso header fits entirely within a single mblk fragment,
6614 6614           * we'll avoid an additional copy of the lso header here and just
6615 6615           * pass the b_rptr of the mblk directly.
6616 6616           *
6617 6617           * If this isn't true, we'd have to allocate for it explicitly.
6618 6618           */
6619 6619          if (lso->lso_hdr_sz <= MBLKL(mp)) {
6620 6620                  lso->lso_hdr = mp->b_rptr;
6621 6621          } else {
6622 6622                  /* On work completion, remember to free this allocated hdr */
6623 6623                  lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
6624 6624                  if (lso->lso_hdr == NULL) {
6625 6625                          DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
6626 6626                              "sz = %d", lso->lso_hdr_sz);
6627 6627                          lso->lso_hdr_sz = 0;
6628 6628                          lso->lso_mss = 0;
6629 6629                          return (-1);
6630 6630                  }
6631 6631          }
6632 6632  
6633 6633          /*
6634 6634           * Copy in the lso header only if we need to
6635 6635           */
6636 6636          if (lso->lso_hdr != mp->b_rptr) {
6637 6637                  dst = lso->lso_hdr;
6638 6638                  pending = lso->lso_hdr_sz;
6639 6639  
6640 6640                  for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
6641 6641                          mblen = MBLKL(nmp);
6642 6642                          if (pending > mblen) {
6643 6643                                  bcopy(nmp->b_rptr, dst, mblen);
6644 6644                                  dst += mblen;
6645 6645                                  pending -= mblen;
6646 6646                          } else {
6647 6647                                  bcopy(nmp->b_rptr, dst, pending);
6648 6648                                  break;
6649 6649                          }
6650 6650                  }
6651 6651          }
6652 6652  
6653 6653          return (0);
6654 6654  }
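
/*
 * Editor's aside -- the header-gather loop at the end of
 * ibd_setup_lso() in isolation: copy the first `len' bytes of a
 * fragment chain into one flat buffer.  frag_t is a hypothetical
 * stand-in for an mblk b_cont chain.
 */
#include <string.h>
#include <stddef.h>
#include <stdint.h>

typedef struct frag {
	struct frag *f_next;
	uint8_t *f_data;
	size_t f_len;
} frag_t;

static size_t
gather_hdr(const frag_t *f, uint8_t *dst, size_t len)
{
	size_t pending = len;

	for (; f != NULL && pending > 0; f = f->f_next) {
		size_t n = (f->f_len < pending) ? f->f_len : pending;

		(void) memcpy(dst, f->f_data, n);
		dst += n;
		pending -= n;
	}
	return (len - pending);		/* bytes actually copied */
}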
6655 6655  
6656 6656  static void
6657 6657  ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
6658 6658  {
6659 6659          ibt_wr_lso_t *lso;
6660 6660  
6661 6661          if ((!node) || (!mp))
6662 6662                  return;
6663 6663  
6664 6664          /*
6665 6665           * Free any header space that we might've allocated if we
6666 6666           * did an LSO
6667 6667           */
6668 6668          if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
6669 6669                  lso = &(node->w_swr.wr.ud_lso);
6670 6670                  if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
6671 6671                          kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
6672 6672                          lso->lso_hdr = NULL;
6673 6673                          lso->lso_hdr_sz = 0;
6674 6674                  }
6675 6675          }
6676 6676  }
6677 6677  
6678 6678  static void
6679 6679  ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
6680 6680  {
6681 6681          uint_t          i;
6682 6682          uint_t          num_posted;
6683 6683          uint_t          n_wrs;
6684 6684          ibt_status_t    ibt_status;
6685 6685          ibt_send_wr_t   wrs[IBD_MAX_TX_POST_MULTIPLE];
6686 6686          ibd_swqe_t      *tx_head, *elem;
6687 6687          ibd_swqe_t      *nodes[IBD_MAX_TX_POST_MULTIPLE];
6688 6688  
6689 6689          /* post the one request, then check for more */
6690 6690          ibt_status = ibt_post_send(state->id_chnl_hdl,
6691 6691              &node->w_swr, 1, NULL);
6692 6692          if (ibt_status != IBT_SUCCESS) {
6693 6693                  ibd_print_warn(state, "ibd_post_send: "
6694 6694                      "posting one wr failed: ret=%d", ibt_status);
6695 6695                  ibd_tx_cleanup(state, node);
6696 6696          }
6697 6697  
6698 6698          tx_head = NULL;
6699 6699          for (;;) {
6700 6700                  if (tx_head == NULL) {
6701 6701                          mutex_enter(&state->id_txpost_lock);
6702 6702                          tx_head = state->id_tx_head;
6703 6703                          if (tx_head == NULL) {
6704 6704                                  state->id_tx_busy = 0;
6705 6705                                  mutex_exit(&state->id_txpost_lock);
6706 6706                                  return;
6707 6707                          }
6708 6708                          state->id_tx_head = NULL;
6709 6709                          mutex_exit(&state->id_txpost_lock);
6710 6710                  }
6711 6711  
6712 6712                  /*
6713 6713                   * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
6714 6714                   * at a time if possible, and keep posting them.
6715 6715                   */
6716 6716                  for (n_wrs = 0, elem = tx_head;
6717 6717                      (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
6718 6718                      elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
6719 6719                          nodes[n_wrs] = elem;
6720 6720                          wrs[n_wrs] = elem->w_swr;
6721 6721                  }
6722 6722                  tx_head = elem;
6723 6723  
6724 6724                  ASSERT(n_wrs != 0);
6725 6725  
6726 6726                  /*
6727 6727                   * If posting fails for some reason, we'll never receive
6728 6728                   * completion intimation, so we'll need to cleanup. But
6729 6729                   * we need to make sure we don't clean up nodes whose
6730 6730                   * wrs have been successfully posted. We assume that the
6731 6731                   * hca driver returns on the first failure to post and
6732 6732                   * therefore the first 'num_posted' entries don't need
6733 6733                   * cleanup here.
6734 6734                   */
6735 6735                  num_posted = 0;
6736 6736                  ibt_status = ibt_post_send(state->id_chnl_hdl,
6737 6737                      wrs, n_wrs, &num_posted);
6738 6738                  if (ibt_status != IBT_SUCCESS) {
6739 6739                          ibd_print_warn(state, "ibd_post_send: "
6740 6740                              "posting multiple wrs failed: "
6741 6741                              "requested=%d, done=%d, ret=%d",
6742 6742                              n_wrs, num_posted, ibt_status);
6743 6743  
6744 6744                          for (i = num_posted; i < n_wrs; i++)
6745 6745                                  ibd_tx_cleanup(state, nodes[i]);
6746 6746                  }
6747 6747          }
6748 6748  }
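
/*
 * Editor's aside -- the batched-post idiom of ibd_post_send() in
 * miniature: gather up to MAX_BATCH pending requests, post them with
 * one call, and on failure clean up only the entries past
 * `num_posted', relying (as the driver does) on the provider stopping
 * at the first request that fails.  post_batch() is a trivially
 * stubbed stand-in for ibt_post_send().
 */
#include <stddef.h>

#define	MAX_BATCH	16

typedef struct req {
	struct req *r_next;
} req_t;

static int
post_batch(req_t **reqs, unsigned n, unsigned *num_posted)
{
	(void) reqs;
	*num_posted = n;	/* stub: pretend everything was posted */
	return (0);
}

static void
cleanup_req(req_t *r)
{
	(void) r;		/* stub: return the wqe, free buffers, ... */
}

static void
post_all(req_t *head)
{
	req_t *batch[MAX_BATCH];
	unsigned n, done, i;

	while (head != NULL) {
		for (n = 0; head != NULL && n < MAX_BATCH;
		    head = head->r_next, n++)
			batch[n] = head;

		done = 0;
		if (post_batch(batch, n, &done) != 0) {
			/* the first `done' entries now belong to the hw */
			for (i = done; i < n; i++)
				cleanup_req(batch[i]);
		}
	}
}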
6749 6749  
6750 6750  static int
6751 6751  ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
6752 6752      uint_t lsohdr_sz)
6753 6753  {
6754 6754          ibt_wr_ds_t *sgl;
6755 6755          ibt_status_t ibt_status;
6756 6756          mblk_t *nmp;
6757 6757          mblk_t *data_mp;
6758 6758          uchar_t *bufp;
6759 6759          size_t blksize;
6760 6760          size_t skip;
6761 6761          size_t avail;
6762 6762          uint_t pktsize;
6763 6763          uint_t frag_len;
6764 6764          uint_t pending_hdr;
6765 6765          int nmblks;
6766 6766          int i;
6767 6767  
6768 6768          /*
6769 6769           * Let's skip ahead to the data if this is LSO
6770 6770           */
6771 6771          data_mp = mp;
6772 6772          pending_hdr = 0;
6773 6773          if (lsohdr_sz) {
6774 6774                  pending_hdr = lsohdr_sz;
6775 6775                  for (nmp = mp; nmp; nmp = nmp->b_cont) {
6776 6776                          frag_len = nmp->b_wptr - nmp->b_rptr;
6777 6777                          if (frag_len > pending_hdr)
6778 6778                                  break;
6779 6779                          pending_hdr -= frag_len;
6780 6780                  }
6781 6781                  data_mp = nmp;  /* start of data past lso header */
6782 6782                  ASSERT(data_mp != NULL);
6783 6783          }
6784 6784  
6785 6785          /*
6786 6786           * Calculate the size of message data and number of msg blocks
6787 6787           */
6788 6788          pktsize = 0;
6789 6789          for (nmblks = 0, nmp = data_mp; nmp != NULL;
6790 6790              nmp = nmp->b_cont, nmblks++) {
6791 6791                  pktsize += MBLKL(nmp);
6792 6792          }
6793 6793          pktsize -= pending_hdr;
6794 6794  
6795 6795          /*
6796 6796           * We only do ibt_map_mem_iov() if the pktsize is above the
6797 6797           * "copy-threshold", and if the number of mp fragments is less than
6798 6798           * the maximum acceptable.
6799 6799           */
6800 6800          if ((state->id_hca_res_lkey_capab) &&
6801 6801              (pktsize > state->id_ud_tx_copy_thresh) &&
6802 6802              (nmblks < state->id_max_sqseg_hiwm)) {
6803 6803                  ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6804 6804                  ibt_iov_attr_t iov_attr;
6805 6805  
6806 6806                  iov_attr.iov_as = NULL;
6807 6807                  iov_attr.iov = iov_arr;
6808 6808                  iov_attr.iov_buf = NULL;
6809 6809                  iov_attr.iov_list_len = nmblks;
6810 6810                  iov_attr.iov_wr_nds = state->id_max_sqseg;
6811 6811                  iov_attr.iov_lso_hdr_sz = lsohdr_sz;
6812 6812                  iov_attr.iov_flags = IBT_IOV_SLEEP;
6813 6813  
6814 6814                  for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
6815 6815                          iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
6816 6816                          iov_arr[i].iov_len = MBLKL(nmp);
6817 6817                          if (i == 0) {
6818 6818                                  iov_arr[i].iov_addr += pending_hdr;
6819 6819                                  iov_arr[i].iov_len -= pending_hdr;
6820 6820                          }
6821 6821                  }
6822 6822  
6823 6823                  node->w_buftype = IBD_WQE_MAPPED;
6824 6824                  node->w_swr.wr_sgl = node->w_sgl;
6825 6825  
6826 6826                  ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
6827 6827                      (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
6828 6828                  if (ibt_status != IBT_SUCCESS) {
6829 6829                          ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
6830 6830                              "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
6831 6831                          goto ibd_copy_path;
6832 6832                  }
6833 6833  
6834 6834                  return (0);
6835 6835          }
6836 6836  
6837 6837  ibd_copy_path:
6838 6838          if (pktsize <= state->id_tx_buf_sz) {
6839 6839                  node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6840 6840                  node->w_swr.wr_nds = 1;
6841 6841                  node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6842 6842                  node->w_buftype = IBD_WQE_TXBUF;
6843 6843  
6844 6844                  /*
6845 6845                   * Even though this is the copy path for transfers no larger than
6846 6846                   * id_tx_buf_sz, it could still be an LSO packet.  If so, it
6847 6847                   * is possible the first data mblk fragment (data_mp) still
6848 6848                   * contains part of the LSO header that we need to skip.
6849 6849                   */
6850 6850                  bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6851 6851                  for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
6852 6852                          blksize = MBLKL(nmp) - pending_hdr;
6853 6853                          bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
6854 6854                          bufp += blksize;
6855 6855                          pending_hdr = 0;
6856 6856                  }
6857 6857  
6858 6858                  return (0);
6859 6859          }
6860 6860  
6861 6861          /*
6862 6862           * Copy path for transfers greater than id_tx_buf_sz
6863 6863           */
6864 6864          node->w_swr.wr_sgl = node->w_sgl;
6865 6865          if (ibd_acquire_lsobufs(state, pktsize,
6866 6866              node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
6867 6867                  DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
6868 6868                  return (-1);
6869 6869          }
6870 6870          node->w_buftype = IBD_WQE_LSOBUF;
6871 6871  
6872 6872          /*
6873 6873           * Copy the larger-than-id_tx_buf_sz packet into a set of
6874 6874           * fixed-sized, pre-mapped LSO buffers. Note that we might
6875 6875           * need to skip part of the LSO header in the first fragment
6876 6876           * as before.
6877 6877           */
6878 6878          nmp = data_mp;
6879 6879          skip = pending_hdr;
6880 6880          for (i = 0; i < node->w_swr.wr_nds; i++) {
6881 6881                  sgl = node->w_swr.wr_sgl + i;
6882 6882                  bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
6883 6883                  avail = IBD_LSO_BUFSZ;
6884 6884                  while (nmp && avail) {
6885 6885                          blksize = MBLKL(nmp) - skip;
6886 6886                          if (blksize > avail) {
6887 6887                                  bcopy(nmp->b_rptr + skip, bufp, avail);
6888 6888                                  skip += avail;
6889 6889                                  avail = 0;
6890 6890                          } else {
6891 6891                                  bcopy(nmp->b_rptr + skip, bufp, blksize);
6892 6892                                  skip = 0;
6893 6893                                  avail -= blksize;
6894 6894                                  bufp += blksize;
6895 6895                                  nmp = nmp->b_cont;
6896 6896                          }
6897 6897                  }
6898 6898          }
6899 6899  
6900 6900          return (0);
6901 6901  }
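
/*
 * Editor's aside -- the transmit-buffering decision that
 * ibd_prepare_sgl() makes, reduced to a predicate: DMA-map the mblks
 * in place only when the HCA supports a reserved lkey, the payload is
 * big enough to beat the mapping overhead, and the fragment count
 * fits the scatter/gather limit; otherwise copy.  The two constants
 * are hypothetical placeholders for id_ud_tx_copy_thresh and
 * id_max_sqseg_hiwm.
 */
#define	COPY_THRESH	4096	/* bytes; tune against map/unmap cost */
#define	MAX_FRAGS	16	/* sgl entries we are willing to map */

typedef enum { TX_MAP, TX_COPY } tx_path_t;

static tx_path_t
choose_tx_path(int res_lkey_capab, unsigned pktsize, int nfrags)
{
	if (res_lkey_capab && pktsize > COPY_THRESH && nfrags < MAX_FRAGS)
		return (TX_MAP);
	return (TX_COPY);
}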
6902 6902  
6903 6903  /*
6904 6904   * Schedule completion queue polling to reap the resource we're
6905 6905   * short on.  If we implement the change to reap tx completions
6906 6906   * in a separate thread, we'll need to wake up that thread here.
6907 6907   */
6908 6908  static int
6909 6909  ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
6910 6910  {
6911 6911          ibd_req_t *req;
6912 6912  
6913 6913          mutex_enter(&state->id_sched_lock);
6914 6914          state->id_sched_needed |= resource_type;
6915 6915          mutex_exit(&state->id_sched_lock);
6916 6916  
6917 6917          /*
6918 6918           * If we are asked to queue a work entry, we need to do it
6919 6919           */
6920 6920          if (q_flag) {
6921 6921                  req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6922 6922                  if (req == NULL)
6923 6923                          return (-1);
6924 6924  
6925 6925                  ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
6926 6926          }
6927 6927  
6928 6928          return (0);
6929 6929  }
6930 6930  
6931 6931  /*
6932 6932   * The passed in packet has this format:
6933 6933   * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
6934 6934   */
6935 6935  static boolean_t
6936 6936  ibd_send(ibd_state_t *state, mblk_t *mp)
6937 6937  {
6938 6938          ibd_ace_t *ace;
6939 6939          ibd_swqe_t *node;
6940 6940          ipoib_mac_t *dest;
6941 6941          ib_header_info_t *ipibp;
6942 6942          ip6_t *ip6h;
6943 6943          uint_t pktsize;
6944 6944          uint32_t mss;
6945 6945          uint32_t hckflags;
6946 6946          uint32_t lsoflags = 0;
6947 6947          uint_t lsohdr_sz = 0;
6948 6948          int ret, len;
6949 6949          boolean_t dofree = B_FALSE;
6950 6950          boolean_t rc;
6951 6951          /* if (rc_chan == NULL) send by UD; else send by RC; */
6952 6952          ibd_rc_chan_t *rc_chan;
6953 6953          int nmblks;
6954 6954          mblk_t *nmp;
6955 6955  
6956 6956          /*
6957 6957           * If we aren't done with the device initialization and start,
6958 6958           * we shouldn't be here.
6959 6959           */
6960 6960          if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6961 6961                  return (B_FALSE);
6962 6962  
6963 6963          /*
6964 6964           * Obtain an address handle for the destination.
6965 6965           */
6966 6966          ipibp = (ib_header_info_t *)mp->b_rptr;
6967 6967          dest = (ipoib_mac_t *)&ipibp->ib_dst;
6968 6968          if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6969 6969                  IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
6970 6970  
6971 6971          rc_chan = NULL;
6972 6972          ace = ibd_acache_lookup(state, dest, &ret, 1);
6973 6973          if (state->id_enable_rc && (ace != NULL) &&
6974 6974              (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
6975 6975                  if (ace->ac_chan == NULL) {
6976 6976                          state->rc_null_conn++;
6977 6977                  } else {
6978 6978                          if (ace->ac_chan->chan_state ==
6979 6979                              IBD_RC_STATE_ACT_ESTAB) {
6980 6980                                  rc_chan = ace->ac_chan;
6981 6981                                  rc_chan->is_used = B_TRUE;
6982 6982                                  mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
6983 6983                                  node = WQE_TO_SWQE(
6984 6984                                      rc_chan->tx_wqe_list.dl_head);
6985 6985                                  if (node != NULL) {
6986 6986                                          rc_chan->tx_wqe_list.dl_cnt -= 1;
6987 6987                                          rc_chan->tx_wqe_list.dl_head =
6988 6988                                              node->swqe_next;
6989 6989                                  } else {
6990 6990                                          node = ibd_rc_acquire_swqes(rc_chan);
6991 6991                                  }
6992 6992                                  mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
6993 6993  
6994 6994                                  if (node == NULL) {
6995 6995                                          state->rc_swqe_short++;
6996 6996                                          mutex_enter(&state->id_sched_lock);
6997 6997                                          state->id_sched_needed |=
6998 6998                                              IBD_RSRC_RC_SWQE;
6999 6999                                          mutex_exit(&state->id_sched_lock);
7000 7000                                          ibd_dec_ref_ace(state, ace);
7001 7001                                          return (B_FALSE);
7002 7002                                  }
7003 7003                          } else {
7004 7004                                  state->rc_no_estab_conn++;
7005 7005                          }
7006 7006                  }
7007 7007          }
7008 7008  
7009 7009          if (rc_chan == NULL) {
7010 7010                  mutex_enter(&state->id_tx_list.dl_mutex);
7011 7011                  node = WQE_TO_SWQE(state->id_tx_list.dl_head);
7012 7012                  if (node != NULL) {
7013 7013                          state->id_tx_list.dl_cnt -= 1;
7014 7014                          state->id_tx_list.dl_head = node->swqe_next;
7015 7015                  } else {
7016 7016                          node = ibd_acquire_swqe(state);
7017 7017                  }
7018 7018                  mutex_exit(&state->id_tx_list.dl_mutex);
7019 7019                  if (node == NULL) {
7020 7020                          /*
7021 7021                           * If we don't have an swqe available, schedule a
7022 7022                           * transmit completion queue cleanup and hold off on
7023 7023                           * sending more packets until we have some free swqes
7024 7024                           */
7025 7025                          if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
7026 7026                                  if (ace != NULL) {
7027 7027                                          ibd_dec_ref_ace(state, ace);
7028 7028                                  }
7029 7029                                  return (B_FALSE);
7030 7030                          }
7031 7031  
7032 7032                          /*
7033 7033                           * If a poll cannot be scheduled, we have no choice but
7034 7034                           * to drop this packet
7035 7035                           */
7036 7036                          ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
7037 7037                          if (ace != NULL) {
7038 7038                                  ibd_dec_ref_ace(state, ace);
7039 7039                          }
7040 7040                          return (B_TRUE);
7041 7041                  }
7042 7042          }
7043 7043  
7044 7044          /*
7045 7045           * Initialize the commonly used fields in swqe to NULL to protect
7046 7046           * against ibd_tx_cleanup accidentally misinterpreting these on a
7047 7047           * failure.
7048 7048           */
7049 7049          node->swqe_im_mblk = NULL;
7050 7050          node->w_swr.wr_nds = 0;
7051 7051          node->w_swr.wr_sgl = NULL;
7052 7052          node->w_swr.wr_opcode = IBT_WRC_SEND;
7053 7053  
7054 7054          /*
7055 7055           * Calculate the size of message data and number of msg blocks
7056 7056           */
7057 7057          pktsize = 0;
7058 7058          for (nmblks = 0, nmp = mp; nmp != NULL;
7059 7059              nmp = nmp->b_cont, nmblks++) {
7060 7060                  pktsize += MBLKL(nmp);
7061 7061          }
7062 7062  
7063 7063          if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7064 7064                  atomic_inc_64(&state->id_brd_xmt);
7065 7065          else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7066 7066                  atomic_inc_64(&state->id_multi_xmt);
7067 7067  
7068 7068          if (ace != NULL) {
7069 7069                  node->w_ahandle = ace;
7070 7070                  node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
7071 7071          } else {
7072 7072                  DPRINT(5,
7073 7073                      "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
7074 7074                      ((ret == EFAULT) ? "failed" : "queued"),
7075 7075                      htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
7076 7076                      htonl(dest->ipoib_gidpref[1]),
7077 7077                      htonl(dest->ipoib_gidsuff[0]),
7078 7078                      htonl(dest->ipoib_gidsuff[1]));
7079 7079                  state->rc_ace_not_found++;
7080 7080                  node->w_ahandle = NULL;
7081 7081  
7082 7082                  /*
7083 7083                   * If ibd_acache_lookup() returns EFAULT, it means ibd
7084 7084                   * cannot find a path for the specified dest address, so
7085 7085                   * we should drop this packet.  We should also drop the
7086 7086                   * packet if we cannot schedule a poll via the async
7087 7087                   * thread.  In the normal case, ibd returns the packet to
7088 7088                   * the upper layer and waits for AH creation.
7089 7089                   *
7090 7090                   * Note that we always queue a work slot entry for the async
7091 7091                   * thread when we fail AH lookup (even in intr mode); this is
7092 7092                   * due to the convoluted way the code currently looks for AH.
7093 7093                   */
7094 7094                  if (ret == EFAULT) {
7095 7095                          dofree = B_TRUE;
7096 7096                          rc = B_TRUE;
7097 7097                  } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
7098 7098                          dofree = B_TRUE;
7099 7099                          rc = B_TRUE;
7100 7100                  } else {
7101 7101                          dofree = B_FALSE;
7102 7102                          rc = B_FALSE;
7103 7103                  }
7104 7104                  goto ibd_send_fail;
7105 7105          }
7106 7106  
7107 7107          /*
7108 7108           * For ND6 packets, padding is expected at the front of the
7109 7109           * source lladdr; insert that padding here.
7110 7110           */
7111 7111          if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
7112 7112                  if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
7113 7113                          if (!pullupmsg(mp, IPV6_HDR_LEN +
7114 7114                              sizeof (ib_header_info_t))) {
7115 7115                                  DPRINT(10, "ibd_send: pullupmsg failure ");
7116 7116                                  dofree = B_TRUE;
7117 7117                                  rc = B_TRUE;
7118 7118                                  goto ibd_send_fail;
7119 7119                          }
7120 7120                          ipibp = (ib_header_info_t *)mp->b_rptr;
7121 7121                  }
7122 7122                  ip6h = (ip6_t *)((uchar_t *)ipibp +
7123 7123                      sizeof (ib_header_info_t));
7124 7124                  len = ntohs(ip6h->ip6_plen);
7125 7125                  if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7126 7126                          mblk_t  *pad;
7127 7127  
7128 7128                          pad = allocb(4, 0);
                                   if (pad == NULL) {
                                           /* no memory for pad; drop the packet */
                                           DPRINT(10, "ibd_send: allocb failure ");
                                           dofree = B_TRUE;
                                           rc = B_TRUE;
                                           goto ibd_send_fail;
                                   }
7129 7129                          pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
7130 7130                          linkb(mp, pad);
7131 7131                          if (MBLKL(mp) < sizeof (ib_header_info_t) +
7132 7132                              IPV6_HDR_LEN + len + 4) {
7133 7133                                  if (!pullupmsg(mp, sizeof (ib_header_info_t) +
7134 7134                                      IPV6_HDR_LEN + len + 4)) {
7135 7135                                          DPRINT(10, "ibd_send: pullupmsg "
7136 7136                                              "failure ");
7137 7137                                          dofree = B_TRUE;
7138 7138                                          rc = B_TRUE;
7139 7139                                          goto ibd_send_fail;
7140 7140                                  }
7141 7141                                  ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
7142 7142                                      sizeof (ib_header_info_t));
7143 7143                          }
7144 7144  
7145 7145                          /* LINTED: E_CONSTANT_CONDITION */
7146 7146                          IBD_PAD_NSNA(ip6h, len, IBD_SEND);
7147 7147                  }
7148 7148          }
7149 7149  
7150 7150          ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
7151 7151          mp->b_rptr += sizeof (ib_addrs_t);
7152 7152          pktsize -= sizeof (ib_addrs_t);
7153 7153  
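                   /*
                    * For an RC-mode send, one of three buffer strategies is
                    * used below: a small packet is bcopy'd into the swqe
                    * copybuf, a larger packet with few enough fragments is
                    * mapped via ibt_map_mem_iov(), and anything else is copied
                    * into a pre-registered large Tx buffer.
                    */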
7154 7154          if (rc_chan) {  /* send in RC mode */
7155 7155                  ibt_iov_t iov_arr[IBD_MAX_SQSEG];
7156 7156                  ibt_iov_attr_t iov_attr;
7157 7157                  uint_t          i;
7158 7158                  size_t  blksize;
7159 7159                  uchar_t *bufp;
7160 7160                  ibd_rc_tx_largebuf_t *lbufp;
7161 7161  
7162 7162                  atomic_add_64(&state->rc_xmt_bytes, pktsize);
7163 7163  
7164 7164                  /*
7165 7165                   * The upper layer does the Tx checksum, so we don't need
7166 7166                   * to do any checksumming here.
7167 7167                   */
7168 7168                  ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
7169 7169  
7170 7170                  /*
7171 7171                   * We only do ibt_map_mem_iov() if the pktsize is above
7172 7172                   * the "copy-threshold", and if the number of mp
7173 7173                   * fragments is less than the maximum acceptable.
7174 7174                   */
7175 7175                  if (pktsize <= state->id_rc_tx_copy_thresh) {
7176 7176                          atomic_inc_64(&state->rc_xmt_small_pkt);
7177 7177                          /*
7178 7178                           * Only unicast packets are processed in Reliable
7179 7179                           * Connected mode.
7180 7180                           */
7181 7181                          node->swqe_copybuf.ic_sgl.ds_len = pktsize;
7182 7182                          node->w_swr.wr_nds = 1;
7183 7183                          node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
7184 7184                          node->w_buftype = IBD_WQE_TXBUF;
7185 7185  
7186 7186                          bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
7187 7187                          for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7188 7188                                  blksize = MBLKL(nmp);
7189 7189                                  bcopy(nmp->b_rptr, bufp, blksize);
7190 7190                                  bufp += blksize;
7191 7191                          }
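                                   /* data copied out; free the original mblk chain */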
7192 7192                          freemsg(mp);
7193 7193                          ASSERT(node->swqe_im_mblk == NULL);
7194 7194                  } else {
7195 7195                          if ((state->rc_enable_iov_map) &&
7196 7196                              (nmblks < state->rc_max_sqseg_hiwm)) {
7197 7197  
7198 7198                                  /* do ibt_map_mem_iov() */
7199 7199                                  iov_attr.iov_as = NULL;
7200 7200                                  iov_attr.iov = iov_arr;
7201 7201                                  iov_attr.iov_buf = NULL;
7202 7202                                  iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
7203 7203                                  iov_attr.iov_lso_hdr_sz = 0;
7204 7204                                  iov_attr.iov_flags = IBT_IOV_SLEEP;
7205 7205  
7206 7206                                  i = 0;
7207 7207                                  for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7208 7208                                          iov_arr[i].iov_len = MBLKL(nmp);
7209 7209                                          if (iov_arr[i].iov_len != 0) {
7210 7210                                                  iov_arr[i].iov_addr = (caddr_t)
7211 7211                                                      (void *)nmp->b_rptr;
7212 7212                                                  i++;
7213 7213                                          }
7214 7214                                  }
7215 7215                                  iov_attr.iov_list_len = i;
7216 7216                                  node->w_swr.wr_sgl = node->w_sgl;
7217 7217  
7218 7218                                  ret = ibt_map_mem_iov(state->id_hca_hdl,
7219 7219                                      &iov_attr, (ibt_all_wr_t *)&node->w_swr,
7220 7220                                      &node->w_mi_hdl);
7221 7221                                  if (ret != IBT_SUCCESS) {
7222 7222                                          atomic_inc_64(
7223 7223                                              &state->rc_xmt_map_fail_pkt);
7224 7224                                          DPRINT(30, "ibd_send: ibt_map_mem_iov("
7225 7225                                              ") failed, nmblks=%d, real_nmblks"
7226 7226                                              "=%d, ret=0x%x", nmblks, i, ret);
7227 7227                                          goto ibd_rc_large_copy;
7228 7228                                  }
7229 7229  
7230 7230                                  atomic_inc_64(&state->rc_xmt_map_succ_pkt);
7231 7231                                  node->w_buftype = IBD_WQE_MAPPED;
7232 7232                                  node->swqe_im_mblk = mp;
7233 7233                          } else {
7234 7234                                  atomic_inc_64(&state->rc_xmt_fragmented_pkt);
7235 7235  ibd_rc_large_copy:
7236 7236                                  mutex_enter(&state->rc_tx_large_bufs_lock);
7237 7237                                  if (state->rc_tx_largebuf_nfree == 0) {
7238 7238                                          state->rc_xmt_buf_short++;
7239 7239                                          mutex_exit
7240 7240                                              (&state->rc_tx_large_bufs_lock);
7241 7241                                          mutex_enter(&state->id_sched_lock);
7242 7242                                          state->id_sched_needed |=
7243 7243                                              IBD_RSRC_RC_TX_LARGEBUF;
7244 7244                                          mutex_exit(&state->id_sched_lock);
7245 7245                                          dofree = B_FALSE;
7246 7246                                          rc = B_FALSE;
7247 7247                                          /*
7248 7248                                           * If we don't have Tx large
7249 7249                                           * bufs, return failure.
7250 7250                                           * node->w_buftype must not be
7251 7251                                           * IBD_WQE_RC_COPYBUF, otherwise
7252 7252                                           * it will cause problems in
7253 7253                                           * ibd_rc_tx_cleanup().
7254 7254                                           */
7254 7254                                          node->w_buftype = IBD_WQE_TXBUF;
7255 7255                                          goto ibd_send_fail;
7256 7256                                  }
7257 7257  
7258 7258                                  lbufp = state->rc_tx_largebuf_free_head;
7259 7259                                  ASSERT(lbufp->lb_buf != NULL);
7260 7260                                  state->rc_tx_largebuf_free_head =
7261 7261                                      lbufp->lb_next;
7262 7262                                  lbufp->lb_next = NULL;
7263 7263                                  /* Update nfree count */
7264 7264                                  state->rc_tx_largebuf_nfree--;
7265 7265                                  mutex_exit(&state->rc_tx_large_bufs_lock);
7266 7266                                  bufp = lbufp->lb_buf;
7267 7267                                  node->w_sgl[0].ds_va =
7268 7268                                      (ib_vaddr_t)(uintptr_t)bufp;
7269 7269                                  node->w_sgl[0].ds_key =
7270 7270                                      state->rc_tx_mr_desc.md_lkey;
7271 7271                                  node->w_sgl[0].ds_len = pktsize;
7272 7272                                  node->w_swr.wr_sgl = node->w_sgl;
7273 7273                                  node->w_swr.wr_nds = 1;
7274 7274                                  node->w_buftype = IBD_WQE_RC_COPYBUF;
7275 7275                                  node->w_rc_tx_largebuf = lbufp;
7276 7276  
7277 7277                                  for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7278 7278                                          blksize = MBLKL(nmp);
7279 7279                                          if (blksize != 0) {
7280 7280                                                  bcopy(nmp->b_rptr, bufp,
7281 7281                                                      blksize);
7282 7282                                                  bufp += blksize;
7283 7283                                          }
7284 7284                                  }
7285 7285                                  freemsg(mp);
7286 7286                                  ASSERT(node->swqe_im_mblk == NULL);
7287 7287                          }
7288 7288                  }
7289 7289  
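                           /*
                            * Hand the swqe to the hardware: if another thread
                            * already holds tx_busy, chain the swqe for that
                            * thread to post; otherwise take tx_busy and post
                            * directly.
                            */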
7290 7290                  node->swqe_next = NULL;
7291 7291                  mutex_enter(&rc_chan->tx_post_lock);
7292 7292                  if (rc_chan->tx_busy) {
7293 7293                          if (rc_chan->tx_head) {
7294 7294                                  rc_chan->tx_tail->swqe_next =
7295 7295                                      SWQE_TO_WQE(node);
7296 7296                          } else {
7297 7297                                  rc_chan->tx_head = node;
7298 7298                          }
7299 7299                          rc_chan->tx_tail = node;
7300 7300                          mutex_exit(&rc_chan->tx_post_lock);
7301 7301                  } else {
7302 7302                          rc_chan->tx_busy = 1;
7303 7303                          mutex_exit(&rc_chan->tx_post_lock);
7304 7304                          ibd_rc_post_send(rc_chan, node);
7305 7305                  }
7306 7306  
7307 7307                  return (B_TRUE);
7308 7308          } /* send by RC */
7309 7309  
7310 7310          if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
7311 7311                  /*
7312 7312                   * The packet is too long. The packet size from GLD should
7313 7313                   * be <= state->id_mtu + sizeof (ib_addrs_t).
7314 7314                   */
7315 7315                  if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
7316 7316                          ibd_req_t *req;
7317 7317  
7318 7318                          mutex_enter(&ace->tx_too_big_mutex);
7319 7319                          if (ace->tx_too_big_ongoing) {
7320 7320                                  mutex_exit(&ace->tx_too_big_mutex);
7321 7321                                  state->rc_xmt_reenter_too_long_pkt++;
7322 7322                                  dofree = B_TRUE;
7323 7323                          } else {
7324 7324                                  ace->tx_too_big_ongoing = B_TRUE;
7325 7325                                  mutex_exit(&ace->tx_too_big_mutex);
7326 7326                                  state->rc_xmt_icmp_too_long_pkt++;
7327 7327  
7328 7328                                  req = kmem_cache_alloc(state->id_req_kmc,
7329 7329                                      KM_NOSLEEP);
7330 7330                                  if (req == NULL) {
7331 7331                                          ibd_print_warn(state, "ibd_send: alloc "
7332 7332                                              "ibd_req_t fail");
7333 7333                                          /* Drop it. */
7334 7334                                          dofree = B_TRUE;
7335 7335                                  } else {
7336 7336                                          req->rq_ptr = mp;
7337 7337                                          req->rq_ptr2 = ace;
7338 7338                                          ibd_queue_work_slot(state, req,
7339 7339                                              IBD_ASYNC_RC_TOO_BIG);
7340 7340                                          dofree = B_FALSE;
7341 7341                                  }
7342 7342                          }
7343 7343                  } else {
7344 7344                          ibd_print_warn(state, "Reliable Connected mode is on. "
7345 7345                              "Multicast packet length %d > %d is too long to "
7346 7346                              "send, drop it",
7347 7347                              pktsize, state->id_mtu);
7348 7348                          state->rc_xmt_drop_too_long_pkt++;
7349 7349                          /* Drop it. */
7350 7350                          dofree = B_TRUE;
7351 7351                  }
7352 7352                  rc = B_TRUE;
7353 7353                  goto ibd_send_fail;
7354 7354          }
7355 7355  
7356 7356          atomic_add_64(&state->id_xmt_bytes, pktsize);
7357 7357          atomic_inc_64(&state->id_xmt_pkt);
7358 7358  
7359 7359          /*
7360 7360           * Do LSO and checksum related work here.  For an LSO send, set the
7361 7361           * ud destination, the opcode and the LSO header information in the
7362 7362           * work request.
7363 7363           */
7364 7364          mac_lso_get(mp, &mss, &lsoflags);
7365 7365          if ((lsoflags & HW_LSO) != HW_LSO) {
7366 7366                  node->w_swr.wr_opcode = IBT_WRC_SEND;
7367 7367                  lsohdr_sz = 0;
7368 7368          } else {
7369 7369                  if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
7370 7370                          /*
7371 7371                           * The routine can only fail if there's no memory; we
7372 7372                           * can only drop the packet if this happens
7373 7373                           */
7374 7374                          ibd_print_warn(state,
7375 7375                              "ibd_send: no memory, lso posting failed");
7376 7376                          dofree = B_TRUE;
7377 7377                          rc = B_TRUE;
7378 7378                          goto ibd_send_fail;
7379 7379                  }
7380 7380  
7381 7381                  node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
7382 7382                  lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
7383 7383          }
7384 7384  
7385 7385          mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
7386 7386          if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
7387 7387                  node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
7388 7388          else
7389 7389                  node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
7390 7390  
7391 7391          /*
7392 7392           * Prepare the sgl for posting; the routine can only fail if there's
7393 7393           * no LSO buffer available. If so, reschedule and retry the send
7394 7394           * once LSO buffers become available.
7395 7395           */
7396 7396          if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
7397 7397                  if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
7398 7398                          dofree = B_TRUE;
7399 7399                          rc = B_TRUE;
7400 7400                  } else {
7401 7401                          dofree = B_FALSE;
7402 7402                          rc = B_FALSE;
7403 7403                  }
7404 7404                  goto ibd_send_fail;
7405 7405          }
7406 7406          node->swqe_im_mblk = mp;
7407 7407  
7408 7408          /*
7409 7409           * Queue the wqe to hardware; since we can now simply queue a
7410 7410           * post instead of doing it serially, we cannot assume anything
7411 7411           * about the 'node' after ibd_post_send() returns.
7412 7412           */
7413 7413          node->swqe_next = NULL;
7414 7414  
7415 7415          mutex_enter(&state->id_txpost_lock);
7416 7416          if (state->id_tx_busy) {
7417 7417                  if (state->id_tx_head) {
7418 7418                          state->id_tx_tail->swqe_next =
7419 7419                              SWQE_TO_WQE(node);
7420 7420                  } else {
7421 7421                          state->id_tx_head = node;
7422 7422                  }
7423 7423                  state->id_tx_tail = node;
7424 7424                  mutex_exit(&state->id_txpost_lock);
7425 7425          } else {
7426 7426                  state->id_tx_busy = 1;
7427 7427                  mutex_exit(&state->id_txpost_lock);
7428 7428                  ibd_post_send(state, node);
7429 7429          }
7430 7430  
7431 7431          return (B_TRUE);
7432 7432  
7433 7433  ibd_send_fail:
7434 7434          if (node && mp)
7435 7435                  ibd_free_lsohdr(node, mp);
7436 7436  
7437 7437          if (dofree)
7438 7438                  freemsg(mp);
7439 7439  
7440 7440          if (node != NULL) {
7441 7441                  if (rc_chan) {
7442 7442                          ibd_rc_tx_cleanup(node);
7443 7443                  } else {
7444 7444                          ibd_tx_cleanup(state, node);
7445 7445                  }
7446 7446          }
7447 7447  
7448 7448          return (rc);
7449 7449  }
7450 7450  
7451 7451  /*
7452 7452   * GLDv3 entry point for transmitting datagram.
7453 7453   */
7454 7454  static mblk_t *
7455 7455  ibd_m_tx(void *arg, mblk_t *mp)
7456 7456  {
7457 7457          ibd_state_t *state = (ibd_state_t *)arg;
7458 7458          mblk_t *next;
7459 7459  
7460 7460          if (state->id_type == IBD_PORT_DRIVER) {
7461 7461                  freemsgchain(mp);
7462 7462                  return (NULL);
7463 7463          }
7464 7464  
7465 7465          if ((state->id_link_state != LINK_STATE_UP) ||
7466 7466              !(state->id_mac_state & IBD_DRV_STARTED)) {
7467 7467                  freemsgchain(mp);
7468 7468                  mp = NULL;
7469 7469          }
7470 7470  
7471 7471          while (mp != NULL) {
7472 7472                  next = mp->b_next;
7473 7473                  mp->b_next = NULL;
7474 7474                  if (ibd_send(state, mp) == B_FALSE) {
7475 7475                          /* Send fail */
7476 7476                          mp->b_next = next;
7477 7477                          break;
7478 7478                  }
7479 7479                  mp = next;
7480 7480          }
7481 7481  
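                   /*
                    * Any unsent remainder is handed back to GLDv3; a non-NULL
                    * return tells the MAC layer we are flow controlled, and the
                    * chain is retried once Tx resources become available again.
                    */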
7482 7482          return (mp);
7483 7483  }
7484 7484  
7485 7485  /*
7486 7486   * This handles Tx and Rx completions. With separate CQs, it handles
7487 7487   * only Rx completions.
7488 7488   */
7489 7489  static uint_t
7490 7490  ibd_intr(caddr_t arg)
7491 7491  {
7492 7492          ibd_state_t *state = (ibd_state_t *)arg;
7493 7493  
7494 7494          ibd_poll_rcq(state, state->id_rcq_hdl);
7495 7495  
7496 7496          return (DDI_INTR_CLAIMED);
7497 7497  }
7498 7498  
7499 7499  /*
7500 7500   * Poll and fully drain the send cq
7501 7501   */
7502 7502  static void
7503 7503  ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7504 7504  {
7505 7505          ibt_wc_t *wcs = state->id_txwcs;
7506 7506          uint_t numwcs = state->id_txwcs_size;
7507 7507          ibd_wqe_t *wqe;
7508 7508          ibd_swqe_t *head, *tail;
7509 7509          ibt_wc_t *wc;
7510 7510          uint_t num_polled;
7511 7511          int i;
7512 7512  
7513 7513          while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7514 7514                  head = tail = NULL;
7515 7515                  for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7516 7516                          wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
7517 7517                          if (wc->wc_status != IBT_WC_SUCCESS) {
7518 7518                                  /*
7519 7519                                   * Channel being torn down.
7520 7520                                   */
7521 7521                                  if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7522 7522                                          DPRINT(5, "ibd_drain_scq: flush error");
7523 7523                                          DPRINT(10, "ibd_drain_scq: Bad "
7524 7524                                              "status %d", wc->wc_status);
7525 7525                                  } else {
7526 7526                                          DPRINT(10, "ibd_drain_scq: "
7527 7527                                              "unexpected wc_status %d",
7528 7528                                              wc->wc_status);
7529 7529                                  }
7530 7530                                  /*
7531 7531                                   * Fallthrough to invoke the Tx handler to
7532 7532                                   * release held resources, e.g., AH refcount.
7533 7533                                   */
7534 7534                          }
7535 7535                          /*
7536 7536                           * Add this swqe to the list to be cleaned up.
7537 7537                           */
7538 7538                          if (head)
7539 7539                                  tail->swqe_next = wqe;
7540 7540                          else
7541 7541                                  head = WQE_TO_SWQE(wqe);
7542 7542                          tail = WQE_TO_SWQE(wqe);
7543 7543                  }
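                           /*
                            * ibt_poll_cq() returns IBT_CQ_EMPTY when nothing was
                            * polled, so at least one completion was seen above
                            * and tail is non-NULL here.
                            */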
7544 7544                  tail->swqe_next = NULL;
7545 7545                  ibd_tx_cleanup_list(state, head, tail);
7546 7546  
7547 7547                  /*
7548 7548                   * Resume any blocked transmissions if possible
7549 7549                   */
7550 7550                  ibd_resume_transmission(state);
7551 7551          }
7552 7552  }
7553 7553  
7554 7554  /*
7555 7555   * Poll and fully drain the receive cq
7556 7556   */
7557 7557  static void
7558 7558  ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7559 7559  {
7560 7560          ibt_wc_t *wcs = state->id_rxwcs;
7561 7561          uint_t numwcs = state->id_rxwcs_size;
7562 7562          ibd_rwqe_t *rwqe;
7563 7563          ibt_wc_t *wc;
7564 7564          uint_t num_polled;
7565 7565          int i;
7566 7566          mblk_t *head, *tail, *mp;
7567 7567  
7568 7568          while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7569 7569                  head = tail = NULL;
7570 7570                  for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7571 7571                          rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
7572 7572                          if (wc->wc_status != IBT_WC_SUCCESS) {
7573 7573                                  /*
7574 7574                                   * Channel being torn down.
7575 7575                                   */
7576 7576                                  if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7577 7577                                          DPRINT(5, "ibd_drain_rcq: "
7578 7578                                              "expected flushed rwqe");
7579 7579                                  } else {
7580 7580                                          DPRINT(5, "ibd_drain_rcq: "
7581 7581                                              "unexpected wc_status %d",
7582 7582                                              wc->wc_status);
7583 7583                                  }
7584 7584                                  atomic_inc_32(
7585 7585                                      &state->id_rx_list.dl_bufs_outstanding);
7586 7586                                  freemsg(rwqe->rwqe_im_mblk);
7587 7587                                  continue;
7588 7588                          }
7589 7589                          mp = ibd_process_rx(state, rwqe, wc);
7590 7590                          if (mp == NULL)
7591 7591                                  continue;
7592 7592  
7593 7593                          /*
7594 7594                           * Add this mp to the list to send to the nw layer.
7595 7595                           */
7596 7596                          if (head)
7597 7597                                  tail->b_next = mp;
7598 7598                          else
7599 7599                                  head = mp;
7600 7600                          tail = mp;
7601 7601                  }
7602 7602                  if (head)
7603 7603                          mac_rx(state->id_mh, state->id_rh, head);
7604 7604  
7605 7605                  /*
7606 7606                   * Account for #rwqes polled.
7607 7607                   * Post more here, if less than one fourth full.
7608 7608                   */
7609 7609                  if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
7610 7610                      (state->id_ud_num_rwqe / 4))
7611 7611                          ibd_post_recv_intr(state);
7612 7612          }
7613 7613  }
7614 7614  
7615 7615  /*
7616 7616   * Common code for interrupt handling as well as for polling
7617 7617   * for all completed wqe's while detaching.
7618 7618   */
7619 7619  static void
7620 7620  ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7621 7621  {
7622 7622          int flag, redo_flag;
7623 7623          int redo = 1;
7624 7624  
7625 7625          flag = IBD_CQ_POLLING;
7626 7626          redo_flag = IBD_REDO_CQ_POLLING;
7627 7627  
7628 7628          mutex_enter(&state->id_scq_poll_lock);
7629 7629          if (state->id_scq_poll_busy & flag) {
7630 7630                  ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
7631 7631                  state->id_scq_poll_busy |= redo_flag;
7632 7632                  mutex_exit(&state->id_scq_poll_lock);
7633 7633                  return;
7634 7634          }
7635 7635          state->id_scq_poll_busy |= flag;
7636 7636          mutex_exit(&state->id_scq_poll_lock);
7637 7637  
7638 7638          /*
7639 7639           * In some cases (e.g., detaching), this code can be invoked on
7640 7640           * any cpu after disabling cq notification (thus no concurrency
7641 7641           * exists). Apart from that, the following applies normally:
7642 7642           * transmit completion handling could be from any cpu if the
7643 7643           * Tx CQ is poll driven, but always on the Tx interrupt cpu if
7644 7644           * the Tx CQ is interrupt driven.
7645 7645           */
7646 7646  
7647 7647          /*
7648 7648           * Poll and drain the CQ
7649 7649           */
7650 7650          ibd_drain_scq(state, cq_hdl);
7651 7651  
7652 7652          /*
7653 7653           * Enable CQ notifications and redrain the cq to catch any
7654 7654           * completions we might have missed after the ibd_drain_scq()
7655 7655           * above and before the ibt_enable_cq_notify() that follows.
7656 7656           * Finally, service any new requests to poll the cq that
7657 7657           * could've come in after the ibt_enable_cq_notify().
7658 7658           */
7659 7659          do {
7660 7660                  if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
7661 7661                      IBT_SUCCESS) {
7662 7662                          DPRINT(10,
                                       "ibd_poll_scq: ibt_enable_cq_notify() failed");
7663 7663                  }
7664 7664  
7665 7665                  ibd_drain_scq(state, cq_hdl);
7666 7666  
7667 7667                  mutex_enter(&state->id_scq_poll_lock);
7668 7668                  if (state->id_scq_poll_busy & redo_flag)
7669 7669                          state->id_scq_poll_busy &= ~redo_flag;
7670 7670                  else {
7671 7671                          state->id_scq_poll_busy &= ~flag;
7672 7672                          redo = 0;
7673 7673                  }
7674 7674                  mutex_exit(&state->id_scq_poll_lock);
7675 7675  
7676 7676          } while (redo);
7677 7677  }
7678 7678  
7679 7679  /*
7680 7680   * Common code for interrupt handling as well as for polling
7681 7681   * for all completed wqe's while detaching.
7682 7682   */
7683 7683  static void
7684 7684  ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
7685 7685  {
7686 7686          int flag, redo_flag;
7687 7687          int redo = 1;
7688 7688  
7689 7689          flag = IBD_CQ_POLLING;
7690 7690          redo_flag = IBD_REDO_CQ_POLLING;
7691 7691  
7692 7692          mutex_enter(&state->id_rcq_poll_lock);
7693 7693          if (state->id_rcq_poll_busy & flag) {
7694 7694                  ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
7695 7695                  state->id_rcq_poll_busy |= redo_flag;
7696 7696                  mutex_exit(&state->id_rcq_poll_lock);
7697 7697                  return;
7698 7698          }
7699 7699          state->id_rcq_poll_busy |= flag;
7700 7700          mutex_exit(&state->id_rcq_poll_lock);
7701 7701  
7702 7702          /*
7703 7703           * Poll and drain the CQ
7704 7704           */
7705 7705          ibd_drain_rcq(state, rcq);
7706 7706  
7707 7707          /*
7708 7708           * Enable CQ notifications and redrain the cq to catch any
7709 7709           * completions we might have missed after the ibd_drain_rcq()
7710 7710           * above and before the ibt_enable_cq_notify() that follows.
7711 7711           * Finally, service any new requests to poll the cq that
7712 7712           * could've come in after the ibt_enable_cq_notify().
7713 7713           */
7714 7714          do {
7715 7715                  if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
7716 7716                      IBT_SUCCESS) {
7717 7717                          DPRINT(10,
                                       "ibd_poll_rcq: ibt_enable_cq_notify() failed");
7718 7718                  }
7719 7719  
7720 7720                  ibd_drain_rcq(state, rcq);
7721 7721  
7722 7722                  mutex_enter(&state->id_rcq_poll_lock);
7723 7723                  if (state->id_rcq_poll_busy & redo_flag)
7724 7724                          state->id_rcq_poll_busy &= ~redo_flag;
7725 7725                  else {
7726 7726                          state->id_rcq_poll_busy &= ~flag;
7727 7727                          redo = 0;
7728 7728                  }
7729 7729                  mutex_exit(&state->id_rcq_poll_lock);
7730 7730  
7731 7731          } while (redo);
7732 7732  }
7733 7733  
7734 7734  /*
7735 7735   * Unmap the memory area associated with a given swqe.
7736 7736   */
7737 7737  void
7738 7738  ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
7739 7739  {
7740 7740          ibt_status_t stat;
7741 7741  
7742 7742          DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
7743 7743  
7744 7744          if (swqe->w_mi_hdl) {
7745 7745                  if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
7746 7746                      swqe->w_mi_hdl)) != IBT_SUCCESS) {
7747 7747                          DPRINT(10,
7748 7748                              "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
7749 7749                  }
7750 7750                  swqe->w_mi_hdl = NULL;
7751 7751          }
7752 7752          swqe->w_swr.wr_nds = 0;
7753 7753  }
7754 7754  
7755 7755  void
7756 7756  ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
7757 7757  {
7758 7758          /*
7759 7759           * The recycling logic can be eliminated from here
7760 7760           * and put into the async thread if we create another
7761 7761           * list to hold ACE's for unjoined mcg's.
7762 7762           */
7763 7763          if (DEC_REF_DO_CYCLE(ace)) {
7764 7764                  ibd_mce_t *mce;
7765 7765  
7766 7766                  /*
7767 7767                   * Check with the lock taken: we decremented
7768 7768                   * reference count without the lock, and some
7769 7769                   * transmitter might already have bumped the
7770 7770                   * reference count (possible in case of multicast
7771 7771                   * disable when we leave the AH on the active
7772 7772                   * list). If not still 0, get out, leaving the
7773 7773                   * recycle bit intact.
7774 7774                   *
7775 7775                   * Atomically transition the AH from active
7776 7776                   * to free list, and queue a work request to
7777 7777                   * leave the group and destroy the mce. No
7778 7778                   * transmitter can be looking at the AH or
7779 7779                   * the MCE in between, since we have the
7780 7780                   * ac_mutex lock. In the SendOnly reap case,
7781 7781                   * it is not necessary to hold the ac_mutex
7782 7782                   * and recheck the ref count (since the AH was
7783 7783                   * taken off the active list), we just do it
7784 7784                   * to have uniform processing with the Full
7785 7785                   * reap case.
7786 7786                   */
7787 7787                  mutex_enter(&state->id_ac_mutex);
7788 7788                  mce = ace->ac_mce;
7789 7789                  if (GET_REF_CYCLE(ace) == 0) {
7790 7790                          CLEAR_REFCYCLE(ace);
7791 7791                          /*
7792 7792                           * Identify the case of fullmember reap as
7793 7793                           * opposed to mcg trap reap. Also, port up
7794 7794                           * might set ac_mce to NULL to indicate Tx
7795 7795                           * cleanup should do no more than put the
7796 7796                           * AH in the free list (see ibd_async_link).
7797 7797                           */
7798 7798                          if (mce != NULL) {
7799 7799                                  ace->ac_mce = NULL;
7800 7800                                  IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
7801 7801                                  /*
7802 7802                                   * mc_req was initialized at mce
7803 7803                                   * creation time.
7804 7804                                   */
7805 7805                                  ibd_queue_work_slot(state,
7806 7806                                      &mce->mc_req, IBD_ASYNC_REAP);
7807 7807                          }
7808 7808                          IBD_ACACHE_INSERT_FREE(state, ace);
7809 7809                  }
7810 7810                  mutex_exit(&state->id_ac_mutex);
7811 7811          }
7812 7812  }
7813 7813  
7814 7814  /*
7815 7815   * Common code that deals with cleanups after a successful or
7816 7816   * erroneous transmission attempt.
7817 7817   */
7818 7818  static void
7819 7819  ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
7820 7820  {
7821 7821          ibd_ace_t *ace = swqe->w_ahandle;
7822 7822  
7823 7823          DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
7824 7824  
7825 7825          /*
7826 7826           * If this was a dynamic mapping in ibd_send(), we need to
7827 7827           * unmap here. If this was an lso buffer we'd used for sending,
7828 7828           * we need to release the lso buf to the pool, since the resource
7829 7829           * is scarce. However, if this was simply a normal send using
7830 7830           * the copybuf (present in each swqe), we don't need to release it.
7831 7831           */
7832 7832          if (swqe->swqe_im_mblk != NULL) {
7833 7833                  if (swqe->w_buftype == IBD_WQE_MAPPED) {
7834 7834                          ibd_unmap_mem(state, swqe);
7835 7835                  } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7836 7836                          ibd_release_lsobufs(state,
7837 7837                              swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7838 7838                  }
7839 7839                  ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7840 7840                  freemsg(swqe->swqe_im_mblk);
7841 7841                  swqe->swqe_im_mblk = NULL;
7842 7842          }
7843 7843  
7844 7844          /*
7845 7845           * Drop the reference count on the AH; it can be reused
7846 7846           * now for a different destination if there are no more
7847 7847           * posted sends that will use it. This can be eliminated
7848 7848           * if we can always associate each Tx buffer with an AH.
7849 7849           * The ace can be null if we are cleaning up from the
7850 7850           * ibd_send() error path.
7851 7851           */
7852 7852          if (ace != NULL) {
7853 7853                  ibd_dec_ref_ace(state, ace);
7854 7854          }
7855 7855  
7856 7856          /*
7857 7857           * Release the send wqe for reuse.
7858 7858           */
7859 7859          swqe->swqe_next = NULL;
7860 7860          ibd_release_swqe(state, swqe, swqe, 1);
7861 7861  }
7862 7862  
7863 7863  static void
7864 7864  ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
7865 7865  {
7866 7866          ibd_ace_t *ace;
7867 7867          ibd_swqe_t *swqe;
7868 7868          int n = 0;
7869 7869  
7870 7870          DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
7871 7871  
7872 7872          for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
7873 7873  
7874 7874                  /*
7875 7875                   * If this was a dynamic mapping in ibd_send(), we need to
7876 7876                   * unmap here. If this was an lso buffer we'd used for sending,
7877 7877                   * we need to release the lso buf to the pool, since the
7878 7878                   * resource is scarce. However, if this was simply a normal
7879 7879                   * send using the copybuf (present in each swqe), we don't need
7880 7880                   * to release it.
7881 7881                   */
7882 7882                  if (swqe->swqe_im_mblk != NULL) {
7883 7883                          if (swqe->w_buftype == IBD_WQE_MAPPED) {
7884 7884                                  ibd_unmap_mem(state, swqe);
7885 7885                          } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7886 7886                                  ibd_release_lsobufs(state,
7887 7887                                      swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7888 7888                          }
7889 7889                          ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7890 7890                          freemsg(swqe->swqe_im_mblk);
7891 7891                          swqe->swqe_im_mblk = NULL;
7892 7892                  }
7893 7893  
7894 7894                  /*
7895 7895                   * Drop the reference count on the AH; it can be reused
7896 7896                   * now for a different destination if there are no more
7897 7897                   * posted sends that will use it. This can be eliminated
7898 7898                   * if we can always associate each Tx buffer with an AH.
7899 7899                   * The ace can be null if we are cleaning up from the
7900 7900                   * ibd_send() error path.
7901 7901                   */
7902 7902                  ace = swqe->w_ahandle;
7903 7903                  if (ace != NULL) {
7904 7904                          ibd_dec_ref_ace(state, ace);
7905 7905                  }
7906 7906                  n++;
7907 7907          }
7908 7908  
7909 7909          /*
7910 7910           * Release the send wqes for reuse.
7911 7911           */
7912 7912          ibd_release_swqe(state, head, tail, n);
7913 7913  }
7914 7914  
7915 7915  /*
7916 7916   * Processing to be done after receipt of a packet; hand the packet
7917 7917   * off to GLD in the format it expects.  The received packet has this
7918 7918   * format: 2b sap :: 00 :: data.
7919 7919   */
7920 7920  static mblk_t *
7921 7921  ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
7922 7922  {
7923 7923          ib_header_info_t *phdr;
7924 7924          mblk_t *mp;
7925 7925          ipoib_hdr_t *ipibp;
7926 7926          ipha_t *iphap;
7927 7927          ip6_t *ip6h;
7928 7928          int len;
7929 7929          ib_msglen_t pkt_len = wc->wc_bytes_xfer;
7930 7930          uint32_t bufs;
7931 7931  
7932 7932          /*
7933 7933           * Track buffers handed to the upper layer that must be returned.
7934 7934           */
7935 7935          bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
7936 7936  
7937 7937          /*
                    * Never run out of rwqes: when too many receive buffers are
                    * loaned upstream, copy the packet into a fresh allocb()'d
                    * mblk so the rwqe can be reposted immediately.
                    */
7938 7938          if (bufs >= state->id_rx_bufs_outstanding_limit) {
7939 7939                  atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7940 7940                  atomic_inc_32(&state->id_rx_allocb);
7941 7941                  mp = allocb(pkt_len, BPRI_HI);
7942 7942                  if (mp) {
7943 7943                          bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
7944 7944                          ibd_post_recv(state, rwqe);
7945 7945                  } else {        /* no memory */
7946 7946                          atomic_inc_32(&state->id_rx_allocb_failed);
7947 7947                          ibd_post_recv(state, rwqe);
7948 7948                          return (NULL);
7949 7949                  }
7950 7950          } else {
7951 7951                  mp = rwqe->rwqe_im_mblk;
7952 7952          }
7953 7953  
7955 7955          /*
7956 7956           * Adjust write pointer depending on how much data came in.
7957 7957           */
7958 7958          mp->b_wptr = mp->b_rptr + pkt_len;
7959 7959  
7960 7960          /*
7961 7961           * Make sure this is NULL or we're in trouble.
7962 7962           */
7963 7963          if (mp->b_next != NULL) {
7964 7964                  ibd_print_warn(state,
7965 7965                      "ibd_process_rx: got duplicate mp from rcq?");
7966 7966                  mp->b_next = NULL;
7967 7967          }
7968 7968  
7969 7969          /*
7970 7970           * The IB link will deliver one of the IB link-layer
7971 7971           * headers, called the Global Routing Header (GRH). The
7972 7972           * ibd driver uses the information in the GRH to build the
7973 7973           * Header_info structure and passes it up to GLDv3 along
7974 7974           * with the datagram.
7975 7975           * If the GRH is not valid, indicate this to GLDv3 by
7976 7976           * setting the VerTcFlow field to 0.
7977 7977           */
7978 7978          phdr = (ib_header_info_t *)mp->b_rptr;
7979 7979          if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
7980 7980                  phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
7981 7981  
7982 7982                  /* if it is a loopback packet, just drop it */
7983 7983                  if (state->id_enable_rc) {
7984 7984                          if (bcmp(&phdr->ib_grh.ipoib_sqpn,
7985 7985                              &state->rc_macaddr_loopback,
7986 7986                              IPOIB_ADDRL) == 0) {
7987 7987                                  freemsg(mp);
7988 7988                                  return (NULL);
7989 7989                          }
7990 7990                  } else {
7991 7991                          if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
7992 7992                              IPOIB_ADDRL) == 0) {
7993 7993                                  freemsg(mp);
7994 7994                                  return (NULL);
7995 7995                          }
7996 7996                  }
7997 7997  
7998 7998                  ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
7999 7999                      sizeof (ipoib_mac_t));
8000 8000                  if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
8001 8001                          phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
8002 8002                          IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
8003 8003                  } else {
8004 8004                          phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
8005 8005                  }
8006 8006          } else {
8007 8007                  /*
8008 8008                   * It cannot be an IBA multicast packet; it must have been
8009 8009                   * unicast to us. Just copy the interface address to dst.
8010 8010                   */
8011 8011                  phdr->ib_grh.ipoib_vertcflow = 0;
8012 8012                  ovbcopy(&state->id_macaddr, &phdr->ib_dst,
8013 8013                      sizeof (ipoib_mac_t));
8014 8014          }
8015 8015  
8016 8016          /*
8017 8017           * For ND6 packets, padding is at the front of the source/target
8018 8018           * lladdr. However, the inet6 layer is not aware of it, so remove
8019 8019           * the padding from such packets.
8020 8020           */
8021 8021          ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
8022 8022          if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
8023 8023                  ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8024 8024                  len = ntohs(ip6h->ip6_plen);
8025 8025                  if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
8026 8026                          /* LINTED: E_CONSTANT_CONDITION */
8027 8027                          IBD_PAD_NSNA(ip6h, len, IBD_RECV);
8028 8028                  }
8029 8029          }
8030 8030  
8031 8031          /*
8032 8032           * Update statistics
8033 8033           */
8034 8034          atomic_add_64(&state->id_rcv_bytes, pkt_len);
8035 8035          atomic_inc_64(&state->id_rcv_pkt);
8036 8036          if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
8037 8037                  atomic_inc_64(&state->id_brd_rcv);
8038 8038          else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
8039 8039                  atomic_inc_64(&state->id_multi_rcv);
8040 8040  
8041 8041          iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8042 8042          /*
8043 8043           * Set receive checksum status in mp
8044 8044           * Hardware checksumming can be considered valid only if:
8045 8045           * 1. CQE.IP_OK bit is set
8046 8046           * 2. CQE.CKSUM = 0xffff
8047 8047           * 3. IPv6 routing header is not present in the packet
8048 8048           * 4. There are no IP options in the IP header
8049 8049           */
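                   /*
                    * The IP_SIMPLE_HDR_VERSION test below covers 3 and 4: it
                    * accepts only an IPv4 header with no options.
                    */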
8050 8050  
8051 8051          if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
8052 8052              (wc->wc_cksum == 0xFFFF) &&
8053 8053              (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
8054 8054                  mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
8055 8055          }
8056 8056  
8057 8057          return (mp);
8058 8058  }
8059 8059  
8060 8060  /*
8061 8061   * Callback code invoked from STREAMs when the receive data buffer is
8062 8062   * free for recycling.
8063 8063   */
8064 8064  static void
8065 8065  ibd_freemsg_cb(char *arg)
8066 8066  {
8067 8067          ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
8068 8068          ibd_state_t *state = rwqe->w_state;
8069 8069  
8070 8070          atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
8071 8071  
8072 8072          /*
8073 8073           * If the driver is stopped, just free the rwqe.
8074 8074           */
8075 8075          if (atomic_add_32_nv(&state->id_running, 0) == 0) {
8076 8076                  DPRINT(6, "ibd_freemsg: wqe being freed");
8077 8077                  rwqe->rwqe_im_mblk = NULL;
8078 8078                  ibd_free_rwqe(state, rwqe);
8079 8079                  return;
8080 8080          }
8081 8081  
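                   /*
                    * Re-wrap the same receive buffer in a fresh mblk, with this
                    * callback re-armed, and post it back to the receive queue.
                    */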
8082 8082          rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
8083 8083              state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
8084 8084          if (rwqe->rwqe_im_mblk == NULL) {
8085 8085                  ibd_free_rwqe(state, rwqe);
8086 8086                  DPRINT(6, "ibd_freemsg: desballoc failed");
8087 8087                  return;
8088 8088          }
8089 8089  
8090 8090          ibd_post_recv(state, rwqe);
8091 8091  }
8092 8092  
8093 8093  static uint_t
8094 8094  ibd_tx_recycle(caddr_t arg)
8095 8095  {
8096 8096          ibd_state_t *state = (ibd_state_t *)arg;
8097 8097  
8098 8098          /*
8099 8099           * Poll for completed entries
8100 8100           */
8101 8101          ibd_poll_scq(state, state->id_scq_hdl);
8102 8102  
8103 8103          return (DDI_INTR_CLAIMED);
8104 8104  }
8105 8105  
8106 8106  #ifdef IBD_LOGGING
8107 8107  static void
8108 8108  ibd_log_init(void)
8109 8109  {
8110 8110          ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
8111 8111          ibd_lbuf_ndx = 0;
8112 8112  
8113 8113          mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
8114 8114  }
8115 8115  
8116 8116  static void
8117 8117  ibd_log_fini(void)
8118 8118  {
8119 8119          if (ibd_lbuf)
8120 8120                  kmem_free(ibd_lbuf, IBD_LOG_SZ);
8121 8121          ibd_lbuf_ndx = 0;
8122 8122          ibd_lbuf = NULL;
8123 8123  
8124 8124          mutex_destroy(&ibd_lbuf_lock);
8125 8125  }
8126 8126  
8127 8127  static void
8128 8128  ibd_log(const char *fmt, ...)
8129 8129  {
8130 8130          va_list ap;
8131 8131          uint32_t off;
8132 8132          uint32_t msglen;
8133 8133          char tmpbuf[IBD_DMAX_LINE];
8134 8134  
8135 8135          if (ibd_lbuf == NULL)
8136 8136                  return;
8137 8137  
8138 8138          va_start(ap, fmt);
8139 8139          msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
8140 8140          va_end(ap);
8141 8141  
8142 8142          if (msglen >= IBD_DMAX_LINE)
8143 8143                  msglen = IBD_DMAX_LINE - 1;
8144 8144  
8145 8145          mutex_enter(&ibd_lbuf_lock);
8146 8146  
8147 8147          off = ibd_lbuf_ndx;             /* current msg should go here */
8148 8148          if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
8149 8149                  ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
8150 8150  
8151 8151          ibd_lbuf_ndx += msglen;         /* place where next msg should start */
8152 8152          ibd_lbuf[ibd_lbuf_ndx] = 0;     /* current msg should terminate */
8153 8153  
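                   /* wrap to the front once within two max-size lines of the end */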
8154 8154          if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
8155 8155                  ibd_lbuf_ndx = 0;
8156 8156  
8157 8157          mutex_exit(&ibd_lbuf_lock);
8158 8158  
8159 8159          bcopy(tmpbuf, ibd_lbuf+off, msglen);    /* no lock needed for this */
8160 8160  }
8161 8161  #endif
8162 8162  
8163 8163  /* ARGSUSED */
8164 8164  static int
8165 8165  ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8166 8166      int *rvalp)
8167 8167  {
8168 8168          ibd_create_ioctl_t      *cmd = karg;
8169 8169          ibd_state_t             *state, *port_state, *p;
8170 8170          int                     i, err, rval = 0;
8171 8171          mac_register_t          *macp;
8172 8172          ibt_hca_portinfo_t      *pinfop = NULL;
8173 8173          ibt_status_t            ibt_status;
8174 8174          uint_t                  psize, pinfosz;
8175 8175          boolean_t               force_create = B_FALSE;
8176 8176  
8177 8177          cmd->ibdioc.ioc_status = 0;
8178 8178  
8179 8179          if (cmd->ibdioc.ioc_port_inst < 0) {
8180 8180                  cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8181 8181                  return (EINVAL);
8182 8182          }
8183 8183          port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
8184 8184          if (port_state == NULL) {
8185 8185                  DPRINT(10, "ibd_create_partition: failed to get state %d",
8186 8186                      cmd->ibdioc.ioc_port_inst);
8187 8187                  cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8188 8188                  return (EINVAL);
8189 8189          }
8190 8190  
8191 8191          /* Limited PKeys not supported */
8192 8192          if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
8193 8193                  rval = EINVAL;
8194 8194                  goto part_create_return;
8195 8195          }
8196 8196  
8197 8197          if (cmd->ioc_force_create == 0) {
8198 8198                  /*
8199 8199                   * Check if the port pkey table contains the pkey for which
8200 8200                   * this partition is being created.
8201 8201                   */
8202 8202                  ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8203 8203                      port_state->id_port, &pinfop, &psize, &pinfosz);
8204 8204  
8205 8205                  if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8206 8206                          rval = EINVAL;
8207 8207                          goto part_create_return;
8208 8208                  }
8209 8209  
8210 8210                  if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
8211 8211                          rval = ENETDOWN;
8212 8212                          cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
8213 8213                          goto part_create_return;
8214 8214                  }
8215 8215  
8216 8216                  for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
8217 8217                          if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
8218 8218                                  break;
8219 8219                          }
8220 8220                  }
8221 8221                  if (i == pinfop->p_pkey_tbl_sz) {
8222 8222                          rval = EINVAL;
8223 8223                          cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
8224 8224                          goto part_create_return;
8225 8225                  }
8226 8226          } else {
8227 8227                  force_create = B_TRUE;
8228 8228          }
8229 8229  
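                   /*
                    * Reject a duplicate: same port instance, pkey and
                    * partition link id as an existing partition object.
                    */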
8230 8230          mutex_enter(&ibd_objlist_lock);
8231 8231          for (p = ibd_objlist_head; p; p = p->id_next) {
8232 8232                  if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
8233 8233                      (p->id_pkey == cmd->ioc_pkey) &&
8234 8234                      (p->id_plinkid == cmd->ioc_partid)) {
8235 8235                          mutex_exit(&ibd_objlist_lock);
8236 8236                          rval = EEXIST;
8237 8237                          cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
8238 8238                          goto part_create_return;
8239 8239                  }
8240 8240          }
8241 8241          mutex_exit(&ibd_objlist_lock);
8242 8242  
8243 8243          state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);
8244 8244  
8245 8245          state->id_type          = IBD_PARTITION_OBJ;
8246 8246  
8247 8247          state->id_plinkid       = cmd->ioc_partid;
8248 8248          state->id_dlinkid       = cmd->ibdioc.ioc_linkid;
8249 8249          state->id_port_inst     = cmd->ibdioc.ioc_port_inst;
8250 8250  
8251 8251          state->id_dip           = port_state->id_dip;
8252 8252          state->id_port          = port_state->id_port;
8253 8253          state->id_pkey          = cmd->ioc_pkey;
8254 8254          state->id_hca_guid      = port_state->id_hca_guid;
8255 8255          state->id_port_guid     = port_state->id_port_guid;
8256 8256          state->id_force_create  = force_create;
8257 8257  
8258 8258          mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
8259 8259          cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
8260 8260  
8261 8261          if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
8262 8262                  rval = EIO;
8263 8263                  cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
8264 8264                  goto fail;
8265 8265          }
8266 8266  
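                   /*
                    * Register with the MAC layer as an IB plugin device; in
                    * connected (RC) mode a larger max SDU is advertised than
                    * in UD mode.
                    */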
8267 8267          if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
8268 8268                  rval = EAGAIN;
8269 8269                  goto fail;
8270 8270          }
8271 8271  
8272 8272          macp->m_type_ident      = MAC_PLUGIN_IDENT_IB;
8273 8273          macp->m_dip             = port_state->id_dip;
8274 8274          macp->m_instance        = (uint_t)-1;
8275 8275          macp->m_driver          = state;
8276 8276          macp->m_src_addr        = (uint8_t *)&state->id_macaddr;
8277 8277          macp->m_callbacks       = &ibd_m_callbacks;
8278 8278          macp->m_min_sdu         = 0;
8279 8279          macp->m_multicast_sdu   = IBD_DEF_MAX_SDU;
8280 8280          if (state->id_enable_rc) {
8281 8281                  macp->m_max_sdu         = IBD_DEF_RC_MAX_SDU;
8282 8282          } else {
8283 8283                  macp->m_max_sdu         = IBD_DEF_MAX_SDU;
8284 8284          }
8285 8285          macp->m_priv_props = ibd_priv_props;
8286 8286  
8287 8287          err = mac_register(macp, &state->id_mh);
8288 8288          mac_free(macp);
8289 8289  
8290 8290          if (err != 0) {
8291 8291                  DPRINT(10, "ibd_create_partition: mac_register() failed %d",
8292 8292                      err);
8293 8293                  rval = err;
8294 8294                  goto fail;
8295 8295          }
8296 8296  
8297 8297          err = dls_devnet_create(state->id_mh,
8298 8298              cmd->ioc_partid, crgetzoneid(credp));
8299 8299          if (err != 0) {
8300 8300                  DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
8301 8301                      "%d", err);
8302 8302                  rval = err;
8303 8303                  (void) mac_unregister(state->id_mh);
8304 8304                  goto fail;
8305 8305          }
8306 8306  
8307 8307          /*
8308 8308           * Add the new partition state structure to the list
8309 8309           */
8310 8310          mutex_enter(&ibd_objlist_lock);
8311 8311          if (ibd_objlist_head)
8312 8312                  state->id_next = ibd_objlist_head;
8313 8313  
8314 8314          ibd_objlist_head = state;
8315 8315          mutex_exit(&ibd_objlist_lock);
8316 8316  
8317 8317  part_create_return:
8318 8318          if (pinfop) {
8319 8319                  ibt_free_portinfo(pinfop, pinfosz);
8320 8320          }
8321 8321          return (rval);
8322 8322  
8323 8323  fail:
8324 8324          if (pinfop) {
8325 8325                  ibt_free_portinfo(pinfop, pinfosz);
8326 8326          }
8327 8327          ibd_part_unattach(state);
8328 8328          kmem_free(state, sizeof (ibd_state_t));
8329 8329          return (rval);
8330 8330  }
8331 8331  
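           /*
            * Ioctl handler for partition deletion: destroys the dls link
            * first, backs out if the instance is started, busy or in late
            * HCA init, then unattaches the partition and frees its state.
            */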
8332 8332  /* ARGSUSED */
8333 8333  static int
8334 8334  ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8335 8335      int *rvalp)
8336 8336  {
8337 8337          int err;
8338 8338          datalink_id_t tmpid;
8339 8339          ibd_state_t *node, *prev;
8340 8340          ibd_delete_ioctl_t *cmd = karg;
8341 8341  
8342 8342          prev = NULL;
8343 8343  
8344 8344          mutex_enter(&ibd_objlist_lock);
8345 8345          node = ibd_objlist_head;
8346 8346  
8347 8347          /* Find the ibd state structure corresponding to the partition */
8348 8348          while (node != NULL) {
8349 8349                  if (node->id_plinkid == cmd->ioc_partid)
8350 8350                          break;
8351 8351                  prev = node;
8352 8352                  node = node->id_next;
8353 8353          }
8354 8354  
8355 8355          if (node == NULL) {
8356 8356                  mutex_exit(&ibd_objlist_lock);
8357 8357                  return (ENOENT);
8358 8358          }
8359 8359  
8360 8360          if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
8361 8361                  DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
8362 8362                      "%d", err);
8363 8363                  mutex_exit(&ibd_objlist_lock);
8364 8364                  return (err);
8365 8365          }
8366 8366  
8367 8367          /*
8368 8368           * Call ibd_part_unattach() only after making sure that the instance has
8369 8369           * not been started yet and is also not in late hca init mode.
8370 8370           */
8371 8371          ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8372 8372  
8373 8373          err = 0;
8374 8374          if ((node->id_mac_state & IBD_DRV_STARTED) ||
8375 8375              (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
8376 8376              (ibd_part_busy(node) != DDI_SUCCESS) ||
8377 8377              ((err = mac_disable(node->id_mh)) != 0)) {
8378 8378                  (void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
8379 8379                      crgetzoneid(credp));
8380 8380                  ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8381 8381                  mutex_exit(&ibd_objlist_lock);
8382 8382                  return (err != 0 ? err : EBUSY);
8383 8383          }
8384 8384  
8385 8385          node->id_mac_state |= IBD_DRV_IN_DELETION;
8386 8386  
8387 8387          ibd_part_unattach(node);
8388 8388  
8389 8389          ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8390 8390  
8391 8391          /* Remove the partition state structure from the linked list */
8392 8392          if (prev == NULL)
8393 8393                  ibd_objlist_head = node->id_next;
8394 8394          else
8395 8395                  prev->id_next = node->id_next;
8396 8396          mutex_exit(&ibd_objlist_lock);
8397 8397  
8398 8398          if ((err = mac_unregister(node->id_mh)) != 0) {
8399 8399                  DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
8400 8400                      err);
8401 8401          }
8402 8402  
8403 8403          cv_destroy(&node->id_macst_cv);
8404 8404          mutex_destroy(&node->id_macst_lock);
8405 8405  
8406 8406          kmem_free(node, sizeof (ibd_state_t));
8407 8407  
8408 8408          return (0);
8409 8409  }
8410 8410  
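           /*
            * Ioctl handler for the info commands: copies partition or port
            * attributes (including the pkey table) out to the caller,
            * handling both ILP32 and LP64 data models.
            */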
8411 8411  /* ARGSUSED */
8412 8412  static int
8413 8413  ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
8414 8414      int *rvalp)
8415 8415  {
8416 8416          ibd_ioctl_t             cmd;
8417 8417          ibpart_ioctl_t          partioc;
8418 8418          ibport_ioctl_t          portioc;
8419 8419  #ifdef _MULTI_DATAMODEL
8420 8420          ibport_ioctl32_t        portioc32;
8421 8421  #endif
8422 8422          ibd_state_t             *state, *port_state;
8423 8423          int                     size;
8424 8424          ibt_hca_portinfo_t      *pinfop = NULL;
8425 8425          ibt_status_t            ibt_status;
8426 8426          uint_t                  psize, pinfosz;
8427 8427          int                     rval = 0;
8428 8428  
8429 8429          size = sizeof (ibd_ioctl_t);
8430 8430          if (ddi_copyin((void *)arg, &cmd, size, mode)) {
8431 8431                  return (EFAULT);
8432 8432          }
8433 8433          cmd.ioc_status = 0;
8434 8434          switch (cmd.ioc_info_cmd) {
8435 8435          case IBD_INFO_CMD_IBPART:
8436 8436                  size = sizeof (ibpart_ioctl_t);
8437 8437                  if (ddi_copyin((void *)arg, &partioc, size, mode)) {
8438 8438                          return (EFAULT);
8439 8439                  }
8440 8440  
8441 8441                  mutex_enter(&ibd_objlist_lock);
8442 8442                  /* Find the ibd state structure corresponding to the partition */
8443 8443                  for (state = ibd_objlist_head; state; state = state->id_next) {
8444 8444                          if (state->id_plinkid == cmd.ioc_linkid) {
8445 8445                                  break;
8446 8446                          }
8447 8447                  }
8448 8448  
8449 8449                  if (state == NULL) {
8450 8450                          mutex_exit(&ibd_objlist_lock);
8451 8451                          return (ENOENT);
8452 8452                  }
8453 8453  
8454 8454                  partioc.ibdioc.ioc_linkid = state->id_dlinkid;
8455 8455                  partioc.ibdioc.ioc_port_inst = state->id_port_inst;
8456 8456                  partioc.ibdioc.ioc_portnum = state->id_port;
8457 8457                  partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
8458 8458                  partioc.ibdioc.ioc_portguid = state->id_port_guid;
8459 8459                  partioc.ibdioc.ioc_status = 0;
8460 8460                  partioc.ioc_partid = state->id_plinkid;
8461 8461                  partioc.ioc_pkey = state->id_pkey;
8462 8462                  partioc.ioc_force_create = state->id_force_create;
8463 8463                  if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
8464 8464                          mutex_exit(&ibd_objlist_lock);
8465 8465                          return (EFAULT);
8466 8466                  }
8467 8467                  mutex_exit(&ibd_objlist_lock);
8468 8468  
8469 8469                  break;
8470 8470  
8471 8471          case IBD_INFO_CMD_IBPORT:
8472 8472                  if ((cmd.ioc_port_inst < 0) || ((port_state =
8473 8473                      ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8474 8474                          DPRINT(10, "ibd_get_partition_info: failed"
8475 8475                              " to get state %d", cmd.ioc_port_inst);
8476 8476                          size = sizeof (ibd_ioctl_t);
8477 8477                          cmd.ioc_status = IBD_INVALID_PORT_INST;
8478 8478                          if (ddi_copyout((void *)&cmd, (void *)arg, size,
8479 8479                              mode)) {
8480 8480                                  return (EFAULT);
8481 8481                          }
8482 8482                          return (EINVAL);
8483 8483                  }
8484 8484                  ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8485 8485                      port_state->id_port, &pinfop, &psize, &pinfosz);
8486 8486                  if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8487 8487                          return (EINVAL);
8488 8488                  }
8489 8489  #ifdef _MULTI_DATAMODEL
8490 8490                  switch (ddi_model_convert_from(mode & FMODELS)) {
8491 8491                  case DDI_MODEL_ILP32: {
8492 8492                          size = sizeof (ibport_ioctl32_t);
8493 8493                          if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8494 8494                                  rval = EFAULT;
8495 8495                                  goto fail;
8496 8496                          }
8497 8497                          portioc32.ibdioc.ioc_status = 0;
8498 8498                          portioc32.ibdioc.ioc_portnum = port_state->id_port;
8499 8499                          portioc32.ibdioc.ioc_hcaguid =
8500 8500                              port_state->id_hca_guid;
8501 8501                          portioc32.ibdioc.ioc_portguid =
8502 8502                              port_state->id_port_guid;
8503 8503                          if (portioc32.ioc_pkey_tbl_sz !=
8504 8504                              pinfop->p_pkey_tbl_sz) {
8505 8505                                  rval = EINVAL;
8506 8506                                  size = sizeof (ibd_ioctl_t);
8507 8507                                  portioc32.ibdioc.ioc_status =
8508 8508                                      IBD_INVALID_PKEY_TBL_SIZE;
8509 8509                                  if (ddi_copyout((void *)&portioc32.ibdioc,
8510 8510                                      (void *)arg, size, mode)) {
8511 8511                                          rval = EFAULT;
8512 8512                                          goto fail;
8513 8513                                  }
8514 8514                                  goto fail;
8515 8515                          }
8516 8516                          size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8517 8517                          if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8518 8518                              (void *)(uintptr_t)portioc32.ioc_pkeys, size,
8519 8519                              mode)) {
8520 8520                                  rval = EFAULT;
8521 8521                                  goto fail;
8522 8522                          }
8523 8523                          size = sizeof (ibport_ioctl32_t);
8524 8524                          if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8525 8525                              mode)) {
8526 8526                                  rval = EFAULT;
8527 8527                                  goto fail;
8528 8528                          }
8529 8529                          break;
8530 8530                  }
8531 8531                  case DDI_MODEL_NONE:
8532 8532                          size = sizeof (ibport_ioctl_t);
8533 8533                          if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8534 8534                                  rval = EFAULT;
8535 8535                                  goto fail;
8536 8536                          }
8537 8537                          portioc.ibdioc.ioc_status = 0;
8538 8538                          portioc.ibdioc.ioc_portnum = port_state->id_port;
8539 8539                          portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8540 8540                          portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8541 8541                          if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8542 8542                                  rval = EINVAL;
8543 8543                                  size = sizeof (ibd_ioctl_t);
8544 8544                                  portioc.ibdioc.ioc_status =
8545 8545                                      IBD_INVALID_PKEY_TBL_SIZE;
8546 8546                                  if (ddi_copyout((void *)&portioc.ibdioc,
8547 8547                                      (void *)arg, size, mode)) {
8548 8548                                          rval = EFAULT;
8549 8549                                          goto fail;
8550 8550                                  }
8551 8551                                  goto fail;
8552 8552                          }
8553 8553                          size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8554 8554                          if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8555 8555                              (void *)(portioc.ioc_pkeys), size, mode)) {
8556 8556                                  rval = EFAULT;
8557 8557                                  goto fail;
8558 8558                          }
8559 8559                          size = sizeof (ibport_ioctl_t);
8560 8560                          if (ddi_copyout((void *)&portioc, (void *)arg, size,
8561 8561                              mode)) {
8562 8562                                  rval = EFAULT;
8563 8563                                  goto fail;
8564 8564                          }
8565 8565                          break;
8566 8566                  }
8567 8567  #else /* ! _MULTI_DATAMODEL */
8568 8568                  size = sizeof (ibport_ioctl_t);
8569 8569                  if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8570 8570                          rval = EFAULT;
8571 8571                          goto fail;
8572 8572                  }
8573 8573                  portioc.ibdioc.ioc_status = 0;
8574 8574                  portioc.ibdioc.ioc_portnum = port_state->id_port;
8575 8575                  portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8576 8576                  portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8577 8577                  if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8578 8578                          rval = EINVAL;
8579 8579                          size = sizeof (ibd_ioctl_t);
8580 8580                          portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
8581 8581                          if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
8582 8582                              size, mode)) {
8583 8583                                  rval = EFAULT;
8584 8584                                  goto fail;
8585 8585                          }
8586 8586                          goto fail;
8587 8587                  }
8588 8588                  size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8589 8589                  if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8590 8590                      (void *)(portioc.ioc_pkeys), size, mode)) {
8591 8591                          rval = EFAULT;
8592 8592                          goto fail;
8593 8593                  }
8594 8594                  size = sizeof (ibport_ioctl_t);
8595 8595                  if (ddi_copyout((void *)&portioc, (void *)arg, size,
8596 8596                      mode)) {
8597 8597                          rval = EFAULT;
8598 8598                          goto fail;
8599 8599                  }
8600 8600  #endif /* _MULTI_DATAMODEL */
8601 8601  
8602 8602                  break;
8603 8603  
8604 8604          case IBD_INFO_CMD_PKEYTBLSZ:
8605 8605                  if ((cmd.ioc_port_inst < 0) || ((port_state =
8606 8606                      ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8607 8607                          DPRINT(10, "ibd_get_partition_info: failed"
8608 8608                              " to get state %d", cmd.ioc_port_inst);
8609 8609                          size = sizeof (ibd_ioctl_t);
8610 8610                          cmd.ioc_status = IBD_INVALID_PORT_INST;
8611 8611                          if (ddi_copyout((void *)&cmd, (void *)arg, size,
8612 8612                              mode)) {
8613 8613                                  return (EFAULT);
8614 8614                          }
8615 8615                          return (EINVAL);
8616 8616                  }
8617 8617                  ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8618 8618                      port_state->id_port, &pinfop, &psize, &pinfosz);
8619 8619                  if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8620 8620                          return (EINVAL);
8621 8621                  }
8622 8622  #ifdef _MULTI_DATAMODEL
8623 8623                  switch (ddi_model_convert_from(mode & FMODELS)) {
8624 8624                  case DDI_MODEL_ILP32: {
8625 8625                          size = sizeof (ibport_ioctl32_t);
8626 8626                          if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8627 8627                                  rval = EFAULT;
8628 8628                                  goto fail;
8629 8629                          }
8630 8630                          portioc32.ibdioc.ioc_status = 0;
8631 8631                          portioc32.ibdioc.ioc_portnum = port_state->id_port;
8632 8632                          portioc32.ibdioc.ioc_hcaguid =
8633 8633                              port_state->id_hca_guid;
8634 8634                          portioc32.ibdioc.ioc_portguid =
8635 8635                              port_state->id_port_guid;
8636 8636                          portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8637 8637                          if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8638 8638                              mode)) {
8639 8639                                  rval = EFAULT;
8640 8640                                  goto fail;
8641 8641                          }
8642 8642                          break;
8643 8643                  }
8644 8644                  case DDI_MODEL_NONE:
8645 8645                          size = sizeof (ibport_ioctl_t);
8646 8646                          if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8647 8647                                  rval = EFAULT;
8648 8648                                  goto fail;
8649 8649                          }
8650 8650                          portioc.ibdioc.ioc_status = 0;
8651 8651                          portioc.ibdioc.ioc_portnum = port_state->id_port;
8652 8652                          portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8653 8653                          portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8654 8654                          portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8655 8655                          if (ddi_copyout((void *)&portioc, (void *)arg, size,
8656 8656                              mode)) {
8657 8657                                  rval = EFAULT;
8658 8658                                  goto fail;
8659 8659                          }
8660 8660                          break;
8661 8661                  }
8662 8662  #else /* ! _MULTI_DATAMODEL */
8663 8663                  size = sizeof (ibport_ioctl_t);
8664 8664                  if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8665 8665                          rval = EFAULT;
8666 8666                          goto fail;
8667 8667                  }
8668 8668                  portioc.ibdioc.ioc_status = 0;
8669 8669                  portioc.ibdioc.ioc_portnum = port_state->id_port;
8670 8670                  portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8671 8671                  portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8672 8672                  portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8673 8673                  if (ddi_copyout((void *)&portioc, (void *)arg, size,
8674 8674                      mode)) {
8675 8675                          rval = EFAULT;
8676 8676                          goto fail;
8677 8677                  }
8678 8678  #endif /* _MULTI_DATAMODEL */
8679 8679                  break;
8680 8680  
8681 8681          default:
8682 8682                  return (EINVAL);
8683 8683  
8684 8684          } /* switch (cmd.ioc_info_cmd) */
8685 8685  fail:
8686 8686          if (pinfop) {
8687 8687                  ibt_free_portinfo(pinfop, pinfosz);
8688 8688          }
8689 8689          return (rval);
8690 8690  }
8691 8691  
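           /*
            * IBTF async handler for the port driver: on port up/down events,
            * re-query the link state and notify the MAC layer on a change.
            */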
8692 8692  /* ARGSUSED */
8693 8693  static void
8694 8694  ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
8695 8695      ibt_async_code_t code, ibt_async_event_t *event)
8696 8696  {
8697 8697          ibd_state_t *state = (ibd_state_t *)arg;
8698 8698          link_state_t    lstate;
8699 8699  
8700 8700          switch (code) {
8701 8701          case IBT_EVENT_PORT_UP:
8702 8702          case IBT_ERROR_PORT_DOWN:
8703 8703                  if (ibd_get_port_state(state, &lstate) != 0)
8704 8704                          break;
8705 8705  
8706 8706                  if (state->id_link_state != lstate) {
8707 8707                          state->id_link_state = lstate;
8708 8708                          mac_link_update(state->id_mh, lstate);
8709 8709                  }
8710 8710                  break;
8711 8711          default:
8712 8712                  break;
8713 8713          }
8714 8714  }
8715 8715  
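           /*
            * Query the HCA port and map IBT_PORT_ACTIVE to LINK_STATE_UP;
            * the cached SGID and link speed are refreshed as a side effect.
            */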
8716 8716  static int
8717 8717  ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
8718 8718  {
8719 8719          ibt_hca_portinfo_t *port_infop;
8720 8720          uint_t psize, port_infosz;
8721 8721          ibt_status_t    ret;
8722 8722  
8723 8723          ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
8724 8724              &port_infop, &psize, &port_infosz);
8725 8725          if ((ret != IBT_SUCCESS) || (psize != 1))
8726 8726                  return (-1);
8727 8727  
8728 8728          state->id_sgid = *port_infop->p_sgid_tbl;
8729 8729          state->id_link_speed = ibd_get_portspeed(state);
8730 8730  
8731 8731          if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
8732 8732                  *lstate = LINK_STATE_UP;
8733 8733          else
8734 8734                  *lstate = LINK_STATE_DOWN;
8735 8735  
8736 8736          ibt_free_portinfo(port_infop, port_infosz);
8737 8737          return (0);
8738 8738  }
8739 8739  
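           /*
            * Attach a port driver instance: read the port-number, hca-guid
            * and port-guid properties, attach to IBTL, open the HCA, fetch
            * the initial link state and register with the MAC layer.
            */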
8740 8740  static int
8741 8741  ibd_port_attach(dev_info_t *dip)
8742 8742  {
8743 8743          ibd_state_t             *state;
8744 8744          link_state_t            lstate;
8745 8745          int                     instance;
8746 8746          ibt_status_t            ret;
8747 8747  
8748 8748          /*
8749 8749           * Allocate softstate structure
8750 8750           */
8751 8751          instance = ddi_get_instance(dip);
8752 8752          if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
8753 8753                  DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
8754 8754                  return (DDI_FAILURE);
8755 8755          }
8756 8756  
8757 8757          state = ddi_get_soft_state(ibd_list, instance);
8758 8758  
8759 8759          state->id_dip = dip;
8760 8760          state->id_type = IBD_PORT_DRIVER;
8761 8761  
8762 8762          if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
8763 8763              "port-number", 0)) == 0) {
8764 8764                  DPRINT(10, "ibd_port_attach: invalid port number (%d)",
8765 8765                      state->id_port);
8766 8766                  return (DDI_FAILURE);
8767 8767          }
8768 8768          if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8769 8769              "hca-guid", 0)) == 0) {
8770 8770                  DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
8771 8771                      state->id_hca_guid);
8772 8772                  return (DDI_FAILURE);
8773 8773          }
8774 8774          if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8775 8775              "port-guid", 0)) == 0) {
8776 8776                  DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
8777 8777                      state->id_port_guid);
8778 8778                  return (DDI_FAILURE);
8779 8779          }
8780 8780  
8781 8781          /*
8782 8782           * Attach to IBTL
8783 8783           */
8784 8784          if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
8785 8785              &state->id_ibt_hdl)) != IBT_SUCCESS) {
8786 8786                  DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
8787 8787                      ret);
8788 8788                  goto done;
8789 8789          }
8790 8790  
8791 8791          state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
8792 8792  
8793 8793          if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8794 8794              &state->id_hca_hdl)) != IBT_SUCCESS) {
8795 8795                  DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8796 8796                      ret);
8797 8797                  goto done;
8798 8798          }
8799 8799          state->id_mac_state |= IBD_DRV_HCA_OPENED;
8800 8800  
8801 8801          /* Update link status */
8802 8802  
8803 8803          if (ibd_get_port_state(state, &lstate) != 0) {
8804 8804                  DPRINT(10, "ibd_port_attach: ibd_get_port_state() "
8805 8805                      "failed");
8806 8806                  goto done;
8807 8807          }
8808 8808          state->id_link_state = lstate;
8809 8809          /*
8810 8810           * Register ibd interfaces with the Nemo framework
8811 8811           */
8812 8812          if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
8813 8813                  DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
8814 8814                  goto done;
8815 8815          }
8816 8816          state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8817 8817  
8818 8818          mac_link_update(state->id_mh, lstate);
8819 8819  
8820 8820          return (DDI_SUCCESS);
8821 8821  done:
8822 8822          (void) ibd_port_unattach(state, dip);
8823 8823          return (DDI_FAILURE);
8824 8824  }
8825 8825  
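           /*
            * Undo ibd_port_attach() in reverse order, keyed off the
            * progress bits recorded in id_mac_state.
            */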
8826 8826  static int
8827 8827  ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8828 8828  {
8829 8829          int instance;
8830 8830          uint32_t progress = state->id_mac_state;
8831 8831          ibt_status_t ret;
8832 8832  
8833 8833          if (progress & IBD_DRV_MAC_REGISTERED) {
8834 8834                  (void) mac_unregister(state->id_mh);
8835 8835                  state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8836 8836          }
8837 8837  
8838 8838          if (progress & IBD_DRV_HCA_OPENED) {
8839 8839                  if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8840 8840                      IBT_SUCCESS) {
8841 8841                          ibd_print_warn(state, "failed to close "
8842 8842                              "HCA device, ret=%d", ret);
8843 8843                  }
8844 8844                  state->id_hca_hdl = NULL;
8845 8845                  state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8846 8846          }
8847 8847  
8848 8848          if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8849 8849                  if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8850 8850                          ibd_print_warn(state,
8851 8851                              "ibt_detach() failed, ret=%d", ret);
8852 8852                  }
8853 8853                  state->id_ibt_hdl = NULL;
8854 8854                  state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8855 8855          }
8856 8856          instance = ddi_get_instance(dip);
8857 8857          ddi_soft_state_free(ibd_list, instance);
8858 8858  
8859 8859          return (DDI_SUCCESS);
8860 8860  }
8861 8861  
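           /*
            * Look up the partition with the given link id and copy its
            * attributes out under ibd_objlist_lock.
            */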
8862 8862  ibt_status_t
8863 8863  ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8864 8864  {
8865 8865          ibd_state_t     *state;
8866 8866  
8867 8867          mutex_enter(&ibd_objlist_lock);
8868 8868  
8869 8869          /* Find the ibd state structure corresponding to the partition */
8870 8870          for (state = ibd_objlist_head; state; state = state->id_next) {
8871 8871                  if (state->id_plinkid == linkid) {
8872 8872                          break;
8873 8873                  }
8874 8874          }
8875 8875  
8876 8876          if (state == NULL) {
8877 8877                  mutex_exit(&ibd_objlist_lock);
8878 8878                  return (IBT_NO_SUCH_OBJECT);
8879 8879          }
8880 8880  
8881 8881          attr->pa_dlinkid = state->id_dlinkid;
8882 8882          attr->pa_plinkid = state->id_plinkid;
8883 8883          attr->pa_port = state->id_port;
8884 8884          attr->pa_hca_guid = state->id_hca_guid;
8885 8885          attr->pa_port_guid = state->id_port_guid;
8886 8886          attr->pa_pkey = state->id_pkey;
8887 8887  
8888 8888          mutex_exit(&ibd_objlist_lock);
8889 8889  
8890 8890          return (IBT_SUCCESS);
8891 8891  }
8892 8892  
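           /*
            * Snapshot the attributes of every partition object into a
            * KM_SLEEP-allocated array of *nparts entries.
            */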
8893 8893  ibt_status_t
8894 8894  ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8895 8895  {
8896 8896          ibd_state_t     *state;
8897 8897          int             n = 0;
8898 8898          ibt_part_attr_t *attr;
8899 8899  
8900 8900          mutex_enter(&ibd_objlist_lock);
8901 8901  
8902 8902          for (state = ibd_objlist_head; state; state = state->id_next)
8903 8903                  n++;
8904 8904  
8905 8905          *nparts = n;
8906 8906          if (n == 0) {
8907 8907                  *attr_list = NULL;
8908 8908                  mutex_exit(&ibd_objlist_lock);
8909 8909                  return (IBT_SUCCESS);
8910 8910          }
8911 8911  
8912 8912          *attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8913 8913          attr = *attr_list;
8914 8914          for (state = ibd_objlist_head; state; state = state->id_next) {
8915 8915  #ifdef DEBUG
8916 8916                  ASSERT(n > 0);
8917 8917                  n--;
8918 8918  #endif
8919 8919                  attr->pa_dlinkid = state->id_dlinkid;
8920 8920                  attr->pa_plinkid = state->id_plinkid;
8921 8921                  attr->pa_port = state->id_port;
8922 8922                  attr->pa_hca_guid = state->id_hca_guid;
8923 8923                  attr->pa_port_guid = state->id_port_guid;
8924 8924                  attr->pa_pkey = state->id_pkey;
8925 8925                  attr++;
8926 8926          }
8927 8927  
8928 8928          mutex_exit(&ibd_objlist_lock);
8929 8929          return (IBT_SUCCESS);
8930 8930  }
  
(8454 lines elided)