1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * tavor_misc.c
  29  *    Tavor Miscellaneous routines - Address Handle, Multicast, Protection
  30  *    Domain, and port-related operations
  31  *
  32  *    Implements all the routines necessary for allocating, freeing, querying
  33  *    and modifying Address Handles and Protection Domains.  Also implements
  34  *    all the routines necessary for adding and removing Queue Pairs to/from
  35  *    Multicast Groups.  Lastly, it implements the routines necessary for
  36  *    port-related query and modify operations.
  37  */
  38 
  39 #include <sys/types.h>
  40 #include <sys/conf.h>
  41 #include <sys/ddi.h>
  42 #include <sys/sunddi.h>
  43 #include <sys/modctl.h>
  44 #include <sys/bitmap.h>
  45 #include <sys/sysmacros.h>
  46 
  47 #include <sys/ib/adapters/tavor/tavor.h>
  48 
/*
 * Forward declarations for the static (file-local) helper routines used
 * by the address handle and multicast group code in this file.
 */
static void tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav,
    uint_t flag);
static int tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg,
    tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp, uint_t *qp_found);
static int tavor_mcg_qplist_remove(tavor_mcghdl_t mcg,
    tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp);
static void tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp);
static void tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp);
static uint_t tavor_mcg_walk_mgid_hash(tavor_state_t *state,
    uint64_t start_indx, ib_gid_t mgid, uint_t *prev_indx);
static void tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg,
    tavor_hw_mcg_t *mcg_hdr, ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc);
static int tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx,
    uint_t prev_indx, tavor_hw_mcg_t *mcg_entry);
static int tavor_mcg_entry_invalidate(tavor_state_t *state,
    tavor_hw_mcg_t *mcg_entry, uint_t indx);
static int tavor_mgid_is_valid(ib_gid_t gid);
static int tavor_mlid_is_valid(ib_lid_t lid);
  67 
  68 
  69 /*
  70  * tavor_ah_alloc()
  71  *    Context: Can be called only from user or kernel context.
  72  */
  73 int
  74 tavor_ah_alloc(tavor_state_t *state, tavor_pdhdl_t pd,
  75     ibt_adds_vect_t *attr_p, tavor_ahhdl_t *ahhdl, uint_t sleepflag)
  76 {
  77         tavor_rsrc_t            *udav, *rsrc;
  78         tavor_hw_udav_t         udav_entry;
  79         tavor_ahhdl_t           ah;
  80         ibt_mr_attr_t           mr_attr;
  81         tavor_mr_options_t      op;
  82         tavor_mrhdl_t           mr;
  83         uint64_t                data;
  84         uint32_t                size;
  85         int                     status, i, flag;
  86         char                    *errormsg;
  87 
  88         TAVOR_TNF_ENTER(tavor_ah_alloc);
  89 
  90         /*
  91          * Someday maybe the "ibt_adds_vect_t *attr_p" will be NULL to
  92          * indicate that we wish to allocate an "invalid" (i.e. empty)
  93          * address handle XXX
  94          */
  95 
  96         /* Validate that specified port number is legal */
  97         if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) {
  98                 /* Set "status" and "errormsg" and goto failure */
  99                 TAVOR_TNF_FAIL(IBT_HCA_PORT_INVALID, "invalid port num");
 100                 goto ahalloc_fail;
 101         }
 102 
 103         /*
 104          * Allocate a UDAV entry.  This will be filled in with all the
 105          * necessary parameters to define the Address Handle.  Unlike the
 106          * other hardware resources no ownership transfer takes place as
 107          * these UDAV entries are always owned by hardware.
 108          */
 109         status = tavor_rsrc_alloc(state, TAVOR_UDAV, 1, sleepflag, &udav);
 110         if (status != DDI_SUCCESS) {
 111                 /* Set "status" and "errormsg" and goto failure */
 112                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed UDAV");
 113                 goto ahalloc_fail;
 114         }
 115 
 116         /*
 117          * Allocate the software structure for tracking the address handle
 118          * (i.e. the Tavor Address Handle struct).  If we fail here, we must
 119          * undo the previous resource allocation.
 120          */
 121         status = tavor_rsrc_alloc(state, TAVOR_AHHDL, 1, sleepflag, &rsrc);
 122         if (status != DDI_SUCCESS) {
 123                 /* Set "status" and "errormsg" and goto failure */
 124                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed AH handler");
 125                 goto ahalloc_fail1;
 126         }
 127         ah = (tavor_ahhdl_t)rsrc->tr_addr;
 128         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
 129 
 130         /* Increment the reference count on the protection domain (PD) */
 131         tavor_pd_refcnt_inc(pd);
 132 
 133         /*
 134          * Fill in the UDAV entry.  Note: We are only filling in a temporary
 135          * copy here, which we will later copy into the actual entry in
 136          * Tavor DDR memory.  This starts be zeroing out the temporary copy
 137          * and then calling tavor_set_addr_path() to fill in the common
 138          * portions that can be pulled from the "ibt_adds_vect_t" passed in
 139          */
 140         bzero(&udav_entry, sizeof (tavor_hw_udav_t));
 141         status = tavor_set_addr_path(state, attr_p,
 142             (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL);
 143         if (status != DDI_SUCCESS) {
 144                 tavor_pd_refcnt_dec(pd);
 145                 tavor_rsrc_free(state, &rsrc);
 146                 tavor_rsrc_free(state, &udav);
 147                 /* Set "status" and "errormsg" and goto failure */
 148                 TAVOR_TNF_FAIL(status, "failed in tavor_set_addr_path");
 149                 goto ahalloc_fail;
 150         }
 151         udav_entry.pd     = pd->pd_pdnum;
 152         udav_entry.msg_sz = state->ts_cfg_profile->cp_max_mtu - 1;
 153 
 154         /*
 155          * Register the memory for the UDAV.  The memory for the UDAV must
 156          * be registered in the Tavor TPT tables.  This gives us the LKey
 157          * that we will need when we later post a UD work request that
 158          * uses this address handle.
 159          * We might be able to pre-register all the memory for the UDAV XXX
 160          */
 161         flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
 162         mr_attr.mr_vaddr = (uint64_t)(uintptr_t)udav->tr_addr;
 163         mr_attr.mr_len   = udav->tr_len;
 164         mr_attr.mr_as    = NULL;
 165         mr_attr.mr_flags = flag;
 166         op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
 167         op.mro_bind_dmahdl = NULL;
 168         op.mro_bind_override_addr = 0;
 169         status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
 170         if (status != DDI_SUCCESS) {
 171                 /* Set "status" and "errormsg" and goto failure */
 172                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
 173                 goto ahalloc_fail2;
 174         }
 175 
 176         /*
 177          * Fill in the UDAV entry.  Here we copy all the information from
 178          * the temporary UDAV into the DDR memory for the real UDAV entry.
 179          * Note that we copy everything but the first 64-bit word.  This
 180          * is where the PD number for the address handle resides.
 181          * By filling everything except the PD and then writing the PD in
 182          * a separate step below, we can ensure that the UDAV is not
 183          * accessed while there are partially written values in it (something
 184          * which really should not happen anyway).  This is guaranteed
 185          * because we take measures to ensure that the PD number is zero for
 186          * all unused UDAV (and because PD#0 is reserved for Tavor).
 187          */
 188         size = sizeof (tavor_hw_udav_t) >> 3;
 189         for (i = 1; i < size; i++) {
 190                 data = ((uint64_t *)&udav_entry)[i];
 191                 ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i),
 192                     data);
 193         }
 194         data = ((uint64_t *)&udav_entry)[0];
 195         ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, data);
 196 
 197         /*
 198          * Fill in the rest of the Tavor Address Handle struct.  Having
 199          * successfully copied the UDAV into the hardware, we update the
 200          * following fields for use in further operations on the AH.
 201          *
 202          * NOTE: We are saving away a copy of the "av_dgid.gid_guid" field
 203          * here because we may need to return it later to the IBTF (as a
 204          * result of a subsequent query operation).  Unlike the other UDAV
 205          * parameters, the value of "av_dgid.gid_guid" is not always preserved
 206          * by being written to hardware.  The reason for this is described in
 207          * tavor_set_addr_path().
 208          */
 209         ah->ah_udavrsrcp = udav;
 210         ah->ah_rsrcp  = rsrc;
 211         ah->ah_pdhdl  = pd;
 212         ah->ah_mrhdl  = mr;
 213         ah->ah_save_guid = attr_p->av_dgid.gid_guid;
 214         ah->ah_save_srate = attr_p->av_srate;
 215         *ahhdl = ah;
 216 
 217         /* Determine if later ddi_dma_sync will be necessary */
 218         ah->ah_sync = TAVOR_UDAV_IS_SYNC_REQ(state);
 219 
 220         /* Sync the UDAV for use by the hardware */
 221         tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
 222 
 223         TAVOR_TNF_EXIT(tavor_ah_alloc);
 224         return (DDI_SUCCESS);
 225 
 226 ahalloc_fail2:
 227         tavor_pd_refcnt_dec(pd);
 228         tavor_rsrc_free(state, &rsrc);
 229 ahalloc_fail1:
 230         tavor_rsrc_free(state, &udav);
 231 ahalloc_fail:
 232         TNF_PROBE_1(tavor_ah_alloc_fail, TAVOR_TNF_ERROR, "",
 233             tnf_string, msg, errormsg);
 234         TAVOR_TNF_EXIT(tavor_ah_alloc);
 235         return (status);
 236 }
 237 
 238 
 239 /*
 240  * tavor_ah_free()
 241  *    Context: Can be called only from user or kernel context.
 242  */
 243 /* ARGSUSED */
 244 int
 245 tavor_ah_free(tavor_state_t *state, tavor_ahhdl_t *ahhdl, uint_t sleepflag)
 246 {
 247         tavor_rsrc_t            *udav, *rsrc;
 248         tavor_pdhdl_t           pd;
 249         tavor_mrhdl_t           mr;
 250         tavor_ahhdl_t           ah;
 251         int                     status;
 252 
 253         TAVOR_TNF_ENTER(tavor_ah_free);
 254 
 255         /*
 256          * Pull all the necessary information from the Tavor Address Handle
 257          * struct.  This is necessary here because the resource for the
 258          * AH is going to be freed up as part of this operation.
 259          */
 260         ah    = *ahhdl;
 261         mutex_enter(&ah->ah_lock);
 262         udav  = ah->ah_udavrsrcp;
 263         rsrc  = ah->ah_rsrcp;
 264         pd    = ah->ah_pdhdl;
 265         mr    = ah->ah_mrhdl;
 266         mutex_exit(&ah->ah_lock);
 267         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
 268 
 269         /*
 270          * Deregister the memory for the UDAV.  If this fails for any reason,
 271          * then it is an indication that something (either in HW or SW) has
 272          * gone seriously wrong.  So we print a warning message and return
 273          * failure.
 274          */
 275         status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
 276             sleepflag);
 277         if (status != DDI_SUCCESS) {
 278                 TNF_PROBE_0(tavor_ah_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
 279                 TAVOR_TNF_EXIT(tavor_ah_free);
 280                 return (ibc_get_ci_failure(0));
 281         }
 282 
 283         /*
 284          * Write zero to the first 64-bit word in the UDAV entry.  As
 285          * described above (in tavor_ah_alloc), the PD number is stored in
 286          * the first 64-bits of each UDAV and setting this to zero is
 287          * guaranteed to invalidate the entry.
 288          */
 289         ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, 0);
 290 
 291         /* Sync the UDAV for use by the hardware */
 292         tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
 293 
 294         /* Decrement the reference count on the protection domain (PD) */
 295         tavor_pd_refcnt_dec(pd);
 296 
 297         /* Free the Tavor Address Handle structure */
 298         tavor_rsrc_free(state, &rsrc);
 299 
 300         /* Free up the UDAV entry resource */
 301         tavor_rsrc_free(state, &udav);
 302 
 303         /* Set the ahhdl pointer to NULL and return success */
 304         *ahhdl = NULL;
 305 
 306         TAVOR_TNF_EXIT(tavor_ah_free);
 307         return (DDI_SUCCESS);
 308 }
 309 
 310 
 311 /*
 312  * tavor_ah_query()
 313  *    Context: Can be called from interrupt or base context.
 314  */
 315 /* ARGSUSED */
 316 int
 317 tavor_ah_query(tavor_state_t *state, tavor_ahhdl_t ah, tavor_pdhdl_t *pd,
 318     ibt_adds_vect_t *attr_p)
 319 {
 320         tavor_hw_udav_t         udav_entry;
 321         tavor_rsrc_t            *udav;
 322         uint64_t                data;
 323         uint32_t                size;
 324         int                     i;
 325 
 326         TAVOR_TNF_ENTER(tavor_ah_query);
 327 
 328         mutex_enter(&ah->ah_lock);
 329         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p))
 330 
 331         /*
 332          * Pull all the necessary information from the Tavor Address Handle
 333          * structure
 334          */
 335         udav    = ah->ah_udavrsrcp;
 336         *pd     = ah->ah_pdhdl;
 337 
 338         /*
 339          * Copy the UDAV entry into the temporary copy.  Here we copy all
 340          * the information from the UDAV entry in DDR memory into the
 341          * temporary UDAV.  Note:  We don't need to sync the UDAV for
 342          * reading by software because Tavor HW never modifies the entry.
 343          */
 344         size = sizeof (tavor_hw_udav_t) >> 3;
 345         for (i = 0; i < size; i++) {
 346                 data = ddi_get64(udav->tr_acchdl,
 347                     ((uint64_t *)udav->tr_addr + i));
 348                 ((uint64_t *)&udav_entry)[i] = data;
 349         }
 350 
 351         /*
 352          * Fill in "ibt_adds_vect_t".  We call tavor_get_addr_path() to fill
 353          * the common portions that can be pulled from the UDAV we pass in.
 354          *
 355          * NOTE: We will also fill the "av_dgid.gid_guid" field from the
 356          * "ah_save_guid" field we have previously saved away.  The reason
 357          * for this is described in tavor_ah_alloc() and tavor_ah_modify().
 358          */
 359         tavor_get_addr_path(state, (tavor_hw_addr_path_t *)&udav_entry,
 360             attr_p, TAVOR_ADDRPATH_UDAV, NULL);
 361 
 362         attr_p->av_dgid.gid_guid = ah->ah_save_guid;
 363         attr_p->av_srate = ah->ah_save_srate;
 364 
 365         mutex_exit(&ah->ah_lock);
 366         TAVOR_TNF_EXIT(tavor_ah_query);
 367         return (DDI_SUCCESS);
 368 }
 369 
 370 
 371 /*
 372  * tavor_ah_modify()
 373  *    Context: Can be called from interrupt or base context.
 374  */
 375 /* ARGSUSED */
 376 int
 377 tavor_ah_modify(tavor_state_t *state, tavor_ahhdl_t ah,
 378     ibt_adds_vect_t *attr_p)
 379 {
 380         tavor_hw_udav_t         udav_entry;
 381         tavor_rsrc_t            *udav;
 382         uint64_t                data_new, data_old;
 383         uint32_t                udav_pd, size, portnum_new;
 384         int                     i, status;
 385 
 386         TAVOR_TNF_ENTER(tavor_ah_modify);
 387 
 388         /* Validate that specified port number is legal */
 389         if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) {
 390                 TNF_PROBE_1(tavor_ah_modify_inv_portnum,
 391                     TAVOR_TNF_ERROR, "", tnf_uint, port, attr_p->av_port_num);
 392                 TAVOR_TNF_EXIT(tavor_ah_modify);
 393                 return (IBT_HCA_PORT_INVALID);
 394         }
 395 
 396         mutex_enter(&ah->ah_lock);
 397 
 398         /*
 399          * Pull all the necessary information from the Tavor Address Handle
 400          * structure
 401          */
 402         udav = ah->ah_udavrsrcp;
 403 
 404         /*
 405          * Fill in the UDAV entry.  Note: we are only filling in a temporary
 406          * copy here, which we will later copy into the actual entry in
 407          * Tavor DDR memory.  This starts be zeroing out the temporary copy
 408          * and then calling tavor_set_addr_path() to fill in the common
 409          * portions that can be pulled from the "ibt_adds_vect_t" passed in
 410          *
 411          * NOTE: We also need to save away a copy of the "av_dgid.gid_guid"
 412          * field here (just as we did during tavor_ah_alloc()) because we
 413          * may need to return it later to the IBTF (as a result of a
 414          * subsequent query operation).  As explained in tavor_ah_alloc(),
 415          * unlike the other UDAV parameters, the value of "av_dgid.gid_guid"
 416          * is not always preserved by being written to hardware.  The reason
 417          * for this is described in tavor_set_addr_path().
 418          */
 419         bzero(&udav_entry, sizeof (tavor_hw_udav_t));
 420         status = tavor_set_addr_path(state, attr_p,
 421             (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL);
 422         if (status != DDI_SUCCESS) {
 423                 mutex_exit(&ah->ah_lock);
 424                 TNF_PROBE_0(tavor_ah_modify_setaddrpath_fail,
 425                     TAVOR_TNF_ERROR, "");
 426                 TAVOR_TNF_EXIT(tavor_ah_modify);
 427                 return (status);
 428         }
 429         ah->ah_save_guid = attr_p->av_dgid.gid_guid;
 430         ah->ah_save_srate = attr_p->av_srate;
 431 
 432         /*
 433          * Save away the current PD number for this UDAV.  Then temporarily
 434          * invalidate the entry (by setting the PD to zero).  Note:  Since
 435          * the first 32 bits of the UDAV actually contain the current port
 436          * number _and_ current PD number, we need to mask off some bits.
 437          */
 438         udav_pd = ddi_get32(udav->tr_acchdl, (uint32_t *)udav->tr_addr);
 439         udav_pd = udav_pd & 0xFFFFFF;
 440         ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, 0);
 441 
 442         /* Sync the UDAV for use by the hardware */
 443         tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
 444 
 445         /*
 446          * Copy UDAV structure to the entry
 447          *    Note:  We copy in 64-bit chunks.  For the first two of these
 448          *    chunks it is necessary to read the current contents of the
 449          *    UDAV, mask off the modifiable portions (maintaining any
 450          *    of the "reserved" portions), and then mask on the new data.
 451          */
 452         size = sizeof (tavor_hw_udav_t) >> 3;
 453         for (i = 0; i < size; i++) {
 454                 data_new = ((uint64_t *)&udav_entry)[i];
 455                 data_old = ddi_get64(udav->tr_acchdl,
 456                     ((uint64_t *)udav->tr_addr + i));
 457 
 458                 /*
 459                  * Apply mask to change only the relevant values.  Note: We
 460                  * extract the new portnum from the address handle here
 461                  * because the "PD" and "portnum" fields are in the same
 462                  * 32-bit word in the UDAV.  We will use the (new) port
 463                  * number extracted here when we write the valid PD number
 464                  * in the last step below.
 465                  */
 466                 if (i == 0) {
 467                         data_old = data_old & TAVOR_UDAV_MODIFY_MASK0;
 468                         portnum_new = data_new >> 56;
 469                 } else if (i == 1) {
 470                         data_old = data_old & TAVOR_UDAV_MODIFY_MASK1;
 471                 } else {
 472                         data_old = 0;
 473                 }
 474 
 475                 /* Write the updated values to the UDAV (in DDR) */
 476                 data_new = data_old | data_new;
 477                 ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i),
 478                     data_new);
 479         }
 480 
 481         /*
 482          * Sync the body of the UDAV for use by the hardware.  After we
 483          * have updated the PD number (to make the UDAV valid), we sync
 484          * again to push the entire entry out for hardware access.
 485          */
 486         tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
 487 
 488         /*
 489          * Put the valid PD number back into UDAV entry.  Note: Because port
 490          * number and PD number are in the same word, we must mask the
 491          * new port number with the old PD number before writing it back
 492          * to the UDAV entry
 493          */
 494         udav_pd = ((portnum_new << 24) | udav_pd);
 495         ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, udav_pd);
 496 
 497         /* Sync the rest of the UDAV for use by the hardware */
 498         tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
 499 
 500         mutex_exit(&ah->ah_lock);
 501         TAVOR_TNF_EXIT(tavor_ah_modify);
 502         return (DDI_SUCCESS);
 503 }
 504 
 505 
 506 /*
 507  * tavor_udav_sync()
 508  *    Context: Can be called from interrupt or base context.
 509  */
 510 /* ARGSUSED */
 511 static void
 512 tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav, uint_t flag)
 513 {
 514         ddi_dma_handle_t        dmahdl;
 515         off_t                   offset;
 516         int                     status;
 517 
 518         TAVOR_TNF_ENTER(tavor_udav_sync);
 519 
 520         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
 521 
 522         /* Determine if AH needs to be synced or not */
 523         if (ah->ah_sync == 0) {
 524                 TAVOR_TNF_EXIT(tavor_udav_sync);
 525                 return;
 526         }
 527 
 528         /* Get the DMA handle from AH handle */
 529         dmahdl = ah->ah_mrhdl->mr_bindinfo.bi_dmahdl;
 530 
 531         /* Calculate offset into address handle */
 532         offset = (off_t)0;
 533         status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_udav_t), flag);
 534         if (status != DDI_SUCCESS) {
 535                 TNF_PROBE_0(tavor_udav_sync_getnextentry_fail,
 536                     TAVOR_TNF_ERROR, "");
 537                 TAVOR_TNF_EXIT(tavor_udav_sync);
 538                 return;
 539         }
 540 
 541         TAVOR_TNF_EXIT(tavor_udav_sync);
 542 }
 543 
 544 
 545 /*
 546  * tavor_mcg_attach()
 547  *    Context: Can be called only from user or kernel context.
 548  */
 549 int
 550 tavor_mcg_attach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid,
 551     ib_lid_t lid)
 552 {
 553         tavor_rsrc_t            *rsrc;
 554         tavor_hw_mcg_t          *mcg_entry;
 555         tavor_hw_mcg_qp_list_t  *mcg_entry_qplist;
 556         tavor_mcghdl_t          mcg, newmcg;
 557         uint64_t                mgid_hash;
 558         uint32_t                end_indx;
 559         int                     status;
 560         uint_t                  qp_found;
 561         char                    *errormsg;
 562 
 563         TAVOR_TNF_ENTER(tavor_mcg_attach);
 564 
 565         /*
 566          * It is only allowed to attach MCG to UD queue pairs.  Verify
 567          * that the intended QP is of the appropriate transport type
 568          */
 569         if (qp->qp_serv_type != TAVOR_QP_UD) {
 570                 /* Set "status" and "errormsg" and goto failure */
 571                 TAVOR_TNF_FAIL(IBT_QP_SRV_TYPE_INVALID, "invalid service type");
 572                 goto mcgattach_fail;
 573         }
 574 
 575         /*
 576          * Check for invalid Multicast DLID.  Specifically, all Multicast
 577          * LIDs should be within a well defined range.  If the specified LID
 578          * is outside of that range, then return an error.
 579          */
 580         if (tavor_mlid_is_valid(lid) == 0) {
 581                 /* Set "status" and "errormsg" and goto failure */
 582                 TAVOR_TNF_FAIL(IBT_MC_MLID_INVALID, "invalid MLID");
 583                 goto mcgattach_fail;
 584         }
 585         /*
 586          * Check for invalid Multicast GID.  All Multicast GIDs should have
 587          * a well-defined pattern of bits and flags that are allowable.  If
 588          * the specified GID does not meet the criteria, then return an error.
 589          */
 590         if (tavor_mgid_is_valid(gid) == 0) {
 591                 /* Set "status" and "errormsg" and goto failure */
 592                 TAVOR_TNF_FAIL(IBT_MC_MGID_INVALID, "invalid MGID");
 593                 goto mcgattach_fail;
 594         }
 595 
 596         /*
 597          * Compute the MGID hash value.  Since the MCG table is arranged as
 598          * a number of separate hash chains, this operation converts the
 599          * specified MGID into the starting index of an entry in the hash
 600          * table (i.e. the index for the start of the appropriate hash chain).
 601          * Subsequent operations below will walk the chain searching for the
 602          * right place to add this new QP.
 603          */
 604         status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
 605             &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT());
 606         if (status != TAVOR_CMD_SUCCESS) {
 607                 cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n",
 608                     status);
 609                 TNF_PROBE_1(tavor_mcg_attach_mgid_hash_cmd_fail,
 610                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
 611                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 612                 return (ibc_get_ci_failure(0));
 613         }
 614 
 615         /*
 616          * Grab the multicast group mutex.  Then grab the pre-allocated
 617          * temporary buffer used for holding and/or modifying MCG entries.
 618          * Zero out the temporary MCG entry before we begin.
 619          */
 620         mutex_enter(&state->ts_mcglock);
 621         mcg_entry = state->ts_mcgtmp;
 622         mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry);
 623         bzero(mcg_entry, TAVOR_MCGMEM_SZ(state));
 624 
 625         /*
 626          * Walk through the array of MCG entries starting at "mgid_hash".
 627          * Try to find the appropriate place for this new QP to be added.
 628          * This could happen when the first entry of the chain has MGID == 0
 629          * (which means that the hash chain is empty), or because we find
 630          * an entry with the same MGID (in which case we'll add the QP to
 631          * that MCG), or because we come to the end of the chain (in which
 632          * case this is the first QP being added to the multicast group that
 633          * corresponds to the MGID.  The tavor_mcg_walk_mgid_hash() routine
 634          * walks the list and returns an index into the MCG table.  The entry
 635          * at this index is then checked to determine which case we have
 636          * fallen into (see below).  Note:  We are using the "shadow" MCG
 637          * list (of tavor_mcg_t structs) for this lookup because the real
 638          * MCG entries are in hardware (and the lookup process would be much
 639          * more time consuming).
 640          */
 641         end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, NULL);
 642         mcg      = &state->ts_mcghdl[end_indx];
 643 
 644         /*
 645          * If MGID == 0, then the hash chain is empty.  Just fill in the
 646          * current entry.  Note:  No need to allocate an MCG table entry
 647          * as all the hash chain "heads" are already preallocated.
 648          */
 649         if ((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) {
 650 
 651                 /* Fill in the current entry in the "shadow" MCG list */
 652                 tavor_mcg_setup_new_hdr(mcg, mcg_entry, gid, NULL);
 653 
 654                 /*
 655                  * Try to add the new QP number to the list.  This (and the
 656                  * above) routine fills in a temporary MCG.  The "mcg_entry"
 657                  * and "mcg_entry_qplist" pointers simply point to different
 658                  * offsets within the same temporary copy of the MCG (for
 659                  * convenience).  Note:  If this fails, we need to invalidate
 660                  * the entries we've already put into the "shadow" list entry
 661                  * above.
 662                  */
 663                 status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
 664                     &qp_found);
 665                 if (status != DDI_SUCCESS) {
 666                         bzero(mcg, sizeof (struct tavor_sw_mcg_list_s));
 667                         mutex_exit(&state->ts_mcglock);
 668                         /* Set "status" and "errormsg" and goto failure */
 669                         TAVOR_TNF_FAIL(status, "failed qplist add");
 670                         goto mcgattach_fail;
 671                 }
 672 
 673                 /*
 674                  * Once the temporary MCG has been filled in, write the entry
 675                  * into the appropriate location in the Tavor MCG entry table.
 676                  * If it's successful, then drop the lock and return success.
 677                  * Note: In general, this operation shouldn't fail.  If it
 678                  * does, then it is an indication that something (probably in
 679                  * HW, but maybe in SW) has gone seriously wrong.  We still
 680                  * want to zero out the entries that we've filled in above
 681                  * (in the tavor_mcg_setup_new_hdr() routine).
 682                  */
 683                 status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
 684                     TAVOR_CMD_NOSLEEP_SPIN);
 685                 if (status != TAVOR_CMD_SUCCESS) {
 686                         bzero(mcg, sizeof (struct tavor_sw_mcg_list_s));
 687                         mutex_exit(&state->ts_mcglock);
 688                         TAVOR_WARNING(state, "failed to write MCG entry");
 689                         cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
 690                             "%08x\n", status);
 691                         TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
 692                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 693                             tnf_uint, indx, end_indx);
 694                         TAVOR_TNF_EXIT(tavor_mcg_attach);
 695                         return (ibc_get_ci_failure(0));
 696                 }
 697 
 698                 /*
 699                  * Now that we know all the Tavor firmware accesses have been
 700                  * successful, we update the "shadow" MCG entry by incrementing
 701                  * the "number of attached QPs" count.
 702                  *
 703                  * We increment only if the QP is not already part of the
 704                  * MCG by checking the 'qp_found' flag returned from the
 705                  * qplist_add above.
 706                  */
 707                 if (!qp_found) {
 708                         mcg->mcg_num_qps++;
 709 
 710                         /*
 711                          * Increment the refcnt for this QP.  Because the QP
 712                          * was added to this MCG, the refcnt must be
 713                          * incremented.
 714                          */
 715                         tavor_qp_mcg_refcnt_inc(qp);
 716                 }
 717 
 718                 /*
 719                  * We drop the lock and return success.
 720                  */
 721                 mutex_exit(&state->ts_mcglock);
 722                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 723                 return (DDI_SUCCESS);
 724         }
 725 
 726         /*
 727          * If the specified MGID matches the MGID in the current entry, then
 728          * we need to try to add the QP to the current MCG entry.  In this
 729          * case, it means that we need to read the existing MCG entry (into
 730          * the temporary MCG), add the new QP number to the temporary entry
 731          * (using the same method we used above), and write the entry back
 732          * to the hardware (same as above).
 733          */
 734         if ((mcg->mcg_mgid_h == gid.gid_prefix) &&
 735             (mcg->mcg_mgid_l == gid.gid_guid)) {
 736 
 737                 /*
 738                  * Read the current MCG entry into the temporary MCG.  Note:
 739                  * In general, this operation shouldn't fail.  If it does,
 740                  * then it is an indication that something (probably in HW,
 741                  * but maybe in SW) has gone seriously wrong.
 742                  */
 743                 status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
 744                     TAVOR_CMD_NOSLEEP_SPIN);
 745                 if (status != TAVOR_CMD_SUCCESS) {
 746                         mutex_exit(&state->ts_mcglock);
 747                         TAVOR_WARNING(state, "failed to read MCG entry");
 748                         cmn_err(CE_CONT, "Tavor: READ_MGM command failed: "
 749                             "%08x\n", status);
 750                         TNF_PROBE_2(tavor_mcg_attach_read_mgm_cmd_fail,
 751                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 752                             tnf_uint, indx, end_indx);
 753                         TAVOR_TNF_EXIT(tavor_mcg_attach);
 754                         return (ibc_get_ci_failure(0));
 755                 }
 756 
 757                 /*
 758                  * Try to add the new QP number to the list.  This routine
 759                  * fills in the necessary pieces of the temporary MCG.  The
 760                  * "mcg_entry_qplist" pointer is used to point to the portion
 761                  * of the temporary MCG that holds the QP numbers.
 762                  *
 763                  * Note: tavor_mcg_qplist_add() returns SUCCESS if it
 764                  * already found the QP in the list.  In this case, the QP is
 765                  * not added on to the list again.  Check the flag 'qp_found'
 766                  * if this value is needed to be known.
 767                  *
 768                  */
 769                 status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
 770                     &qp_found);
 771                 if (status != DDI_SUCCESS) {
 772                         mutex_exit(&state->ts_mcglock);
 773                         /* Set "status" and "errormsg" and goto failure */
 774                         TAVOR_TNF_FAIL(status, "failed qplist add");
 775                         goto mcgattach_fail;
 776                 }
 777 
 778                 /*
 779                  * Once the temporary MCG has been updated, write the entry
 780                  * into the appropriate location in the Tavor MCG entry table.
 781                  * If it's successful, then drop the lock and return success.
 782                  * Note: In general, this operation shouldn't fail.  If it
 783                  * does, then it is an indication that something (probably in
 784                  * HW, but maybe in SW) has gone seriously wrong.
 785                  */
 786                 status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
 787                     TAVOR_CMD_NOSLEEP_SPIN);
 788                 if (status != TAVOR_CMD_SUCCESS) {
 789                         mutex_exit(&state->ts_mcglock);
 790                         TAVOR_WARNING(state, "failed to write MCG entry");
 791                         cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
 792                             "%08x\n", status);
 793                         TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
 794                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 795                             tnf_uint, indx, end_indx);
 796                         TAVOR_TNF_EXIT(tavor_mcg_attach);
 797                         return (ibc_get_ci_failure(0));
 798                 }
 799 
 800                 /*
 801                  * Now that we know all the Tavor firmware accesses have been
 802                  * successful, we update the current "shadow" MCG entry by
 803                  * incrementing the "number of attached QPs" count.
 804                  *
 805                  * We increment only if the QP is not already part of the
 806                  * MCG by checking the 'qp_found' flag returned from the
 807                  * qplist_add above.
 808                  */
 809                 if (!qp_found) {
 810                         mcg->mcg_num_qps++;
 811 
 812                         /*
 813                          * Increment the refcnt for this QP.  Because the QP
 814                          * was added to this MCG, the refcnt must be
 815                          * incremented.
 816                          */
 817                         tavor_qp_mcg_refcnt_inc(qp);
 818                 }
 819 
 820                 /*
 821                  * We drop the lock and return success.
 822                  */
 823                 mutex_exit(&state->ts_mcglock);
 824                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 825                 return (DDI_SUCCESS);
 826         }
 827 
 828         /*
 829          * If we've reached here, then we're at the end of the hash chain.
 830          * We need to allocate a new MCG entry, fill it in, write it to Tavor,
 831          * and update the previous entry to link the new one to the end of the
 832          * chain.
 833          */
 834 
 835         /*
 836          * Allocate an MCG table entry.  This will be filled in with all
 837          * the necessary parameters to define the multicast group.  Then it
 838          * will be written to the hardware in the next-to-last step below.
 839          */
 840         status = tavor_rsrc_alloc(state, TAVOR_MCG, 1, TAVOR_NOSLEEP, &rsrc);
 841         if (status != DDI_SUCCESS) {
 842                 mutex_exit(&state->ts_mcglock);
 843                 /* Set "status" and "errormsg" and goto failure */
 844                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MCG");
 845                 goto mcgattach_fail;
 846         }
 847 
 848         /*
 849          * Fill in the new entry in the "shadow" MCG list.  Note:  Just as
 850          * it does above, tavor_mcg_setup_new_hdr() also fills in a portion
 851          * of the temporary MCG entry (the rest of which will be filled in by
 852          * tavor_mcg_qplist_add() below)
 853          */
 854         newmcg = &state->ts_mcghdl[rsrc->tr_indx];
 855         tavor_mcg_setup_new_hdr(newmcg, mcg_entry, gid, rsrc);
 856 
 857         /*
 858          * Try to add the new QP number to the list.  This routine fills in
 859          * the final necessary pieces of the temporary MCG.  The
 860          * "mcg_entry_qplist" pointer is used to point to the portion of the
 861          * temporary MCG that holds the QP numbers.  If we fail here, we
 862          * must undo the previous resource allocation.
 863          *
 864          * Note: tavor_mcg_qplist_add() can we return SUCCESS if it already
 865          * found the QP in the list.  In this case, the QP is not added on to
 866          * the list again.  Check the flag 'qp_found' if this value is needed
 867          * to be known.
 868          */
 869         status = tavor_mcg_qplist_add(state, newmcg, mcg_entry_qplist, qp,
 870             &qp_found);
 871         if (status != DDI_SUCCESS) {
 872                 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
 873                 tavor_rsrc_free(state, &rsrc);
 874                 mutex_exit(&state->ts_mcglock);
 875                 /* Set "status" and "errormsg" and goto failure */
 876                 TAVOR_TNF_FAIL(status, "failed qplist add");
 877                 goto mcgattach_fail;
 878         }
 879 
 880         /*
 881          * Once the temporary MCG has been updated, write the entry into the
 882          * appropriate location in the Tavor MCG entry table.  If this is
 883          * successful, then we need to chain the previous entry to this one.
 884          * Note: In general, this operation shouldn't fail.  If it does, then
 885          * it is an indication that something (probably in HW, but maybe in
 886          * SW) has gone seriously wrong.
 887          */
 888         status = tavor_write_mgm_cmd_post(state, mcg_entry, rsrc->tr_indx,
 889             TAVOR_CMD_NOSLEEP_SPIN);
 890         if (status != TAVOR_CMD_SUCCESS) {
 891                 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
 892                 tavor_rsrc_free(state, &rsrc);
 893                 mutex_exit(&state->ts_mcglock);
 894                 TAVOR_WARNING(state, "failed to write MCG entry");
 895                 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
 896                     status);
 897                 TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
 898                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 899                     tnf_uint, indx, rsrc->tr_indx);
 900                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 901                 return (ibc_get_ci_failure(0));
 902         }
 903 
 904         /*
 905          * Now read the current MCG entry (the one previously at the end of
 906          * hash chain) into the temporary MCG.  We are going to update its
 907          * "next_gid_indx" now and write the entry back to the MCG table.
 908          * Note:  In general, this operation shouldn't fail.  If it does, then
 909          * it is an indication that something (probably in HW, but maybe in SW)
 910          * has gone seriously wrong.  We will free up the MCG entry resource,
 911          * but we will not undo the previously written MCG entry in the HW.
 912          * This is OK, though, because the MCG entry is not currently attached
 913          * to any hash chain.
 914          */
 915         status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
 916             TAVOR_CMD_NOSLEEP_SPIN);
 917         if (status != TAVOR_CMD_SUCCESS) {
 918                 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
 919                 tavor_rsrc_free(state, &rsrc);
 920                 mutex_exit(&state->ts_mcglock);
 921                 TAVOR_WARNING(state, "failed to read MCG entry");
 922                 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
 923                     status);
 924                 TNF_PROBE_2(tavor_mcg_attach_read_mgm_cmd_fail,
 925                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 926                     tnf_uint, indx, end_indx);
 927                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 928                 return (ibc_get_ci_failure(0));
 929         }
 930 
 931         /*
 932          * Finally, we update the "next_gid_indx" field in the temporary MCG
 933          * and attempt to write the entry back into the Tavor MCG table.  If
 934          * this succeeds, then we update the "shadow" list to reflect the
 935          * change, drop the lock, and return success.  Note:  In general, this
 936          * operation shouldn't fail.  If it does, then it is an indication
 937          * that something (probably in HW, but maybe in SW) has gone seriously
 938          * wrong.  Just as we do above, we will free up the MCG entry resource,
 939          * but we will not try to undo the previously written MCG entry.  This
 940          * is OK, though, because (since we failed here to update the end of
 941          * the chain) that other entry is not currently attached to any chain.
 942          */
 943         mcg_entry->next_gid_indx = rsrc->tr_indx;
 944         status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
 945             TAVOR_CMD_NOSLEEP_SPIN);
 946         if (status != TAVOR_CMD_SUCCESS) {
 947                 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
 948                 tavor_rsrc_free(state, &rsrc);
 949                 mutex_exit(&state->ts_mcglock);
 950                 TAVOR_WARNING(state, "failed to write MCG entry");
 951                 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
 952                     status);
 953                 TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
 954                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 955                     tnf_uint, indx, end_indx);
 956                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 957                 return (ibc_get_ci_failure(0));
 958         }
 959         mcg = &state->ts_mcghdl[end_indx];
 960         mcg->mcg_next_indx = rsrc->tr_indx;
 961 
 962         /*
 963          * Now that we know all the Tavor firmware accesses have been
 964          * successful, we update the new "shadow" MCG entry by incrementing
 965          * the "number of attached QPs" count.  Then we drop the lock and
 966          * return success.
 967          */
 968         newmcg->mcg_num_qps++;
 969 
 970         /*
 971          * Increment the refcnt for this QP.  Because the QP
 972          * was added to this MCG, the refcnt must be
 973          * incremented.
 974          */
 975         tavor_qp_mcg_refcnt_inc(qp);
 976 
 977         mutex_exit(&state->ts_mcglock);
 978         TAVOR_TNF_EXIT(tavor_mcg_attach);
 979         return (DDI_SUCCESS);
 980 
 981 mcgattach_fail:
 982         TNF_PROBE_1(tavor_mcg_attach_fail, TAVOR_TNF_ERROR, "", tnf_string,
 983             msg, errormsg);
 984         TAVOR_TNF_EXIT(tavor_mcg_attach);
 985         return (status);
 986 }
 987 
 988 
 989 /*
 990  * tavor_mcg_detach()
 991  *    Context: Can be called only from user or kernel context.
 992  */
 993 int
 994 tavor_mcg_detach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid,
 995     ib_lid_t lid)
 996 {
 997         tavor_hw_mcg_t          *mcg_entry;
 998         tavor_hw_mcg_qp_list_t  *mcg_entry_qplist;
 999         tavor_mcghdl_t          mcg;
1000         uint64_t                mgid_hash;
1001         uint32_t                end_indx, prev_indx;
1002         int                     status;
1003 
1004         TAVOR_TNF_ENTER(tavor_mcg_detach);
1005 
1006         /*
1007          * Check for invalid Multicast DLID.  Specifically, all Multicast
1008          * LIDs should be within a well defined range.  If the specified LID
1009          * is outside of that range, then return an error.
1010          */
1011         if (tavor_mlid_is_valid(lid) == 0) {
1012                 TNF_PROBE_0(tavor_mcg_detach_invmlid_fail, TAVOR_TNF_ERROR, "");
1013                 TAVOR_TNF_EXIT(tavor_mcg_detach);
1014                 return (IBT_MC_MLID_INVALID);
1015         }
1016 
1017         /*
1018          * Compute the MGID hash value.  As described above, the MCG table is
1019          * arranged as a number of separate hash chains.  This operation
1020          * converts the specified MGID into the starting index of an entry in
1021          * the hash table (i.e. the index for the start of the appropriate
1022          * hash chain).  Subsequent operations below will walk the chain
1023          * searching for a matching entry from which to attempt to remove
1024          * the specified QP.
1025          */
1026         status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
1027             &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT());
1028         if (status != TAVOR_CMD_SUCCESS) {
1029                 cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n",
1030                     status);
1031                 TNF_PROBE_1(tavor_mcg_detach_mgid_hash_cmd_fail,
1032                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1033                 TAVOR_TNF_EXIT(tavor_mcg_attach);
1034                 return (ibc_get_ci_failure(0));
1035         }
1036 
1037         /*
1038          * Grab the multicast group mutex.  Then grab the pre-allocated
1039          * temporary buffer used for holding and/or modifying MCG entries.
1040          */
1041         mutex_enter(&state->ts_mcglock);
1042         mcg_entry = state->ts_mcgtmp;
1043         mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry);
1044 
1045         /*
1046          * Walk through the array of MCG entries starting at "mgid_hash".
1047          * Try to find an MCG entry with a matching MGID.  The
1048          * tavor_mcg_walk_mgid_hash() routine walks the list and returns an
1049          * index into the MCG table.  The entry at this index is checked to
1050          * determine whether it is a match or not.  If it is a match, then
1051          * we continue on to attempt to remove the QP from the MCG.  If it
1052          * is not a match (or not a valid MCG entry), then we return an error.
1053          */
1054         end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, &prev_indx);
1055         mcg      = &state->ts_mcghdl[end_indx];
1056 
1057         /*
1058          * If MGID == 0 (the hash chain is empty) or if the specified MGID
1059          * does not match the MGID in the current entry, then return
1060          * IBT_MC_MGID_INVALID (to indicate that the specified MGID is not
1061          * valid).
1062          */
1063         if (((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) ||
1064             ((mcg->mcg_mgid_h != gid.gid_prefix) ||
1065             (mcg->mcg_mgid_l != gid.gid_guid))) {
1066                 mutex_exit(&state->ts_mcglock);
1067                 TNF_PROBE_0(tavor_mcg_detach_invmgid_fail, TAVOR_TNF_ERROR, "");
1068                 TAVOR_TNF_EXIT(tavor_mcg_detach);
1069                 return (IBT_MC_MGID_INVALID);
1070         }
1071 
1072         /*
1073          * Read the current MCG entry into the temporary MCG.  Note: In
1074          * general, this operation shouldn't fail.  If it does, then it is
1075          * an indication that something (probably in HW, but maybe in SW)
1076          * has gone seriously wrong.
1077          */
1078         status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
1079             TAVOR_CMD_NOSLEEP_SPIN);
1080         if (status != TAVOR_CMD_SUCCESS) {
1081                 mutex_exit(&state->ts_mcglock);
1082                 TAVOR_WARNING(state, "failed to read MCG entry");
1083                 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
1084                     status);
1085                 TNF_PROBE_2(tavor_mcg_detach_read_mgm_cmd_fail,
1086                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1087                     tnf_uint, indx, end_indx);
1088                 TAVOR_TNF_EXIT(tavor_mcg_attach);
1089                 return (ibc_get_ci_failure(0));
1090         }
1091 
1092         /*
1093          * Search the QP number list for a match.  If a match is found, then
1094          * remove the entry from the QP list.  Otherwise, if no match is found,
1095          * return an error.
1096          */
1097         status = tavor_mcg_qplist_remove(mcg, mcg_entry_qplist, qp);
1098         if (status != DDI_SUCCESS) {
1099                 mutex_exit(&state->ts_mcglock);
1100                 TAVOR_TNF_EXIT(tavor_mcg_detach);
1101                 return (status);
1102         }
1103 
1104         /*
1105          * Decrement the MCG count for this QP.  When the 'qp_mcg'
1106          * field becomes 0, then this QP is no longer a member of any
1107          * MCG.
1108          */
1109         tavor_qp_mcg_refcnt_dec(qp);
1110 
1111         /*
1112          * If the current MCG's QP number list is about to be made empty
1113          * ("mcg_num_qps" == 1), then remove the entry itself from the hash
1114          * chain.  Otherwise, just write the updated MCG entry back to the
1115          * hardware.  In either case, once we successfully update the hardware
1116          * chain, then we decrement the "shadow" list entry's "mcg_num_qps"
1117          * count (or zero out the entire "shadow" list entry) before returning
1118          * success.  Note:  Zeroing out the "shadow" list entry is done
1119          * inside of tavor_mcg_hash_list_remove().
1120          */
1121         if (mcg->mcg_num_qps == 1) {
1122 
1123                 /* Remove an MCG entry from the hash chain */
1124                 status = tavor_mcg_hash_list_remove(state, end_indx, prev_indx,
1125                     mcg_entry);
1126                 if (status != DDI_SUCCESS) {
1127                         mutex_exit(&state->ts_mcglock);
1128                         TAVOR_TNF_EXIT(tavor_mcg_detach);
1129                         return (status);
1130                 }
1131 
1132         } else {
1133                 /*
1134                  * Write the updated MCG entry back to the Tavor MCG table.
1135                  * If this succeeds, then we update the "shadow" list to
1136                  * reflect the change (i.e. decrement the "mcg_num_qps"),
1137                  * drop the lock, and return success.  Note:  In general,
1138                  * this operation shouldn't fail.  If it does, then it is an
1139                  * indication that something (probably in HW, but maybe in SW)
1140                  * has gone seriously wrong.
1141                  */
1142                 status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
1143                     TAVOR_CMD_NOSLEEP_SPIN);
1144                 if (status != TAVOR_CMD_SUCCESS) {
1145                         mutex_exit(&state->ts_mcglock);
1146                         TAVOR_WARNING(state, "failed to write MCG entry");
1147                         cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
1148                             "%08x\n", status);
1149                         TNF_PROBE_2(tavor_mcg_detach_write_mgm_cmd_fail,
1150                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1151                             tnf_uint, indx, end_indx);
1152                         TAVOR_TNF_EXIT(tavor_mcg_detach);
1153                         return (ibc_get_ci_failure(0));
1154                 }
1155                 mcg->mcg_num_qps--;
1156         }
1157 
1158         mutex_exit(&state->ts_mcglock);
1159         TAVOR_TNF_EXIT(tavor_mcg_detach);
1160         return (DDI_SUCCESS);
1161 }
1162 
1163 /*
1164  * tavor_qp_mcg_refcnt_inc()
1165  *    Context: Can be called from interrupt or base context.
1166  */
1167 static void
1168 tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp)
1169 {
1170         /* Increment the QP's MCG reference count */
1171         mutex_enter(&qp->qp_lock);
1172         qp->qp_mcg_refcnt++;
1173         TNF_PROBE_1_DEBUG(tavor_qp_mcg_refcnt_inc, TAVOR_TNF_TRACE, "",
1174             tnf_uint, refcnt, qp->qp_mcg_refcnt);
1175         mutex_exit(&qp->qp_lock);
1176 }
1177 
1178 
1179 /*
1180  * tavor_qp_mcg_refcnt_dec()
1181  *    Context: Can be called from interrupt or base context.
1182  */
1183 static void
1184 tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp)
1185 {
1186         /* Decrement the QP's MCG reference count */
1187         mutex_enter(&qp->qp_lock);
1188         qp->qp_mcg_refcnt--;
1189         TNF_PROBE_1_DEBUG(tavor_qp_mcg_refcnt_dec, TAVOR_TNF_TRACE, "",
1190             tnf_uint, refcnt, qp->qp_mcg_refcnt);
1191         mutex_exit(&qp->qp_lock);
1192 }
1193 
1194 
1195 /*
1196  * tavor_mcg_qplist_add()
1197  *    Context: Can be called from interrupt or base context.
1198  */
1199 static int
1200 tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg,
1201     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp,
1202     uint_t *qp_found)
1203 {
1204         uint_t          qplist_indx;
1205 
1206         TAVOR_TNF_ENTER(tavor_mcg_qplist_add);
1207 
1208         ASSERT(MUTEX_HELD(&state->ts_mcglock));
1209 
1210         qplist_indx = mcg->mcg_num_qps;
1211 
1212         /*
1213          * Determine if we have exceeded the maximum number of QP per
1214          * multicast group.  If we have, then return an error
1215          */
1216         if (qplist_indx >= state->ts_cfg_profile->cp_num_qp_per_mcg) {
1217                 TNF_PROBE_0(tavor_mcg_qplist_add_too_many_qps,
1218                     TAVOR_TNF_ERROR, "");
1219                 TAVOR_TNF_EXIT(tavor_mcg_qplist_add);
1220                 return (IBT_HCA_MCG_QP_EXCEEDED);
1221         }
1222 
1223         /*
1224          * Determine if the QP is already attached to this MCG table.  If it
1225          * is, then we break out and treat this operation as a NO-OP
1226          */
1227         for (qplist_indx = 0; qplist_indx < mcg->mcg_num_qps;
1228             qplist_indx++) {
1229                 if (mcg_qplist[qplist_indx].qpn == qp->qp_qpnum) {
1230                         break;
1231                 }
1232         }
1233 
1234         /*
1235          * If the QP was already on the list, set 'qp_found' to TRUE.  We still
1236          * return SUCCESS in this case, but the qplist will not have been
1237          * updated because the QP was already on the list.
1238          */
1239         if (qplist_indx < mcg->mcg_num_qps) {
1240                 *qp_found = 1;
1241         } else {
1242                 /*
1243                  * Otherwise, append the new QP number to the end of the
1244                  * current QP list.  Note: We will increment the "mcg_num_qps"
1245                  * field on the "shadow" MCG list entry later (after we know
1246                  * that all necessary Tavor firmware accesses have been
1247                  * successful).
1248                  *
1249                  * Set 'qp_found' to 0 so we know the QP was added on to the
1250                  * list for sure.
1251                  */
1252                 mcg_qplist[qplist_indx].q   = TAVOR_MCG_QPN_VALID;
1253                 mcg_qplist[qplist_indx].qpn = qp->qp_qpnum;
1254                 *qp_found = 0;
1255         }
1256 
1257         TAVOR_TNF_EXIT(tavor_mcg_qplist_add);
1258         return (DDI_SUCCESS);
1259 }
1260 
1261 
1262 
1263 /*
1264  * tavor_mcg_qplist_remove()
1265  *    Context: Can be called from interrupt or base context.
1266  */
1267 static int
1268 tavor_mcg_qplist_remove(tavor_mcghdl_t mcg, tavor_hw_mcg_qp_list_t *mcg_qplist,
1269     tavor_qphdl_t qp)
1270 {
1271         uint_t          i, qplist_indx;
1272 
1273         TAVOR_TNF_ENTER(tavor_mcg_qplist_remove);
1274 
1275         /*
1276          * Search the MCG QP list for a matching QPN.  When
1277          * it's found, we swap the last entry with the current
1278          * one, set the last entry to zero, decrement the last
1279          * entry, and return.  If it's not found, then it's
1280          * and error.
1281          */
1282         qplist_indx = mcg->mcg_num_qps;
1283         for (i = 0; i < qplist_indx; i++) {
1284                 if (mcg_qplist[i].qpn == qp->qp_qpnum) {
1285                         mcg_qplist[i] = mcg_qplist[qplist_indx - 1];
1286                         mcg_qplist[qplist_indx - 1].q = TAVOR_MCG_QPN_INVALID;
1287                         mcg_qplist[qplist_indx - 1].qpn = 0;
1288 
1289                         TAVOR_TNF_EXIT(tavor_mcg_qplist_remove);
1290                         return (DDI_SUCCESS);
1291                 }
1292         }
1293 
1294         TNF_PROBE_0(tavor_mcg_qplist_remove_invqphdl_fail, TAVOR_TNF_ERROR, "");
1295         TAVOR_TNF_EXIT(tavor_mcg_qplist_remove);
1296         return (IBT_QP_HDL_INVALID);
1297 }
1298 
1299 
1300 /*
1301  * tavor_mcg_walk_mgid_hash()
1302  *    Context: Can be called from interrupt or base context.
1303  */
1304 static uint_t
1305 tavor_mcg_walk_mgid_hash(tavor_state_t *state, uint64_t start_indx,
1306     ib_gid_t mgid, uint_t *p_indx)
1307 {
1308         tavor_mcghdl_t  curr_mcghdl;
1309         uint_t          curr_indx, prev_indx;
1310 
1311         TAVOR_TNF_ENTER(tavor_mcg_walk_mgid_hash);
1312 
1313         ASSERT(MUTEX_HELD(&state->ts_mcglock));
1314 
1315         /* Start at the head of the hash chain */
1316         curr_indx   = start_indx;
1317         prev_indx   = curr_indx;
1318         curr_mcghdl = &state->ts_mcghdl[curr_indx];
1319 
1320         /* If the first entry in the chain has MGID == 0, then stop */
1321         if ((curr_mcghdl->mcg_mgid_h == 0) &&
1322             (curr_mcghdl->mcg_mgid_l == 0)) {
1323                 goto end_mgid_hash_walk;
1324         }
1325 
1326         /* If the first entry in the chain matches the MGID, then stop */
1327         if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1328             (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1329                 goto end_mgid_hash_walk;
1330         }
1331 
1332         /* Otherwise, walk the hash chain looking for a match */
1333         while (curr_mcghdl->mcg_next_indx != 0) {
1334                 prev_indx = curr_indx;
1335                 curr_indx = curr_mcghdl->mcg_next_indx;
1336                 curr_mcghdl = &state->ts_mcghdl[curr_indx];
1337 
1338                 if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1339                     (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1340                         break;
1341                 }
1342         }
1343 
1344 end_mgid_hash_walk:
1345         /*
1346          * If necessary, return the index of the previous entry too.  This
1347          * is primarily used for detaching a QP from a multicast group.  It
1348          * may be necessary, in that case, to delete an MCG entry from the
1349          * hash chain and having the index of the previous entry is helpful.
1350          */
1351         if (p_indx != NULL) {
1352                 *p_indx = prev_indx;
1353         }
1354         TAVOR_TNF_EXIT(tavor_mcg_walk_mgid_hash);
1355         return (curr_indx);
1356 }
1357 
1358 
1359 /*
1360  * tavor_mcg_setup_new_hdr()
1361  *    Context: Can be called from interrupt or base context.
1362  */
1363 static void
1364 tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg, tavor_hw_mcg_t *mcg_hdr,
1365     ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc)
1366 {
1367         TAVOR_TNF_ENTER(tavor_mcg_setup_new_hdr);
1368 
1369         /*
1370          * Fill in the fields of the "shadow" entry used by software
1371          * to track MCG hardware entry
1372          */
1373         mcg->mcg_mgid_h         = mgid.gid_prefix;
1374         mcg->mcg_mgid_l         = mgid.gid_guid;
1375         mcg->mcg_rsrcp          = mcg_rsrc;
1376         mcg->mcg_next_indx = 0;
1377         mcg->mcg_num_qps   = 0;
1378 
1379         /*
1380          * Fill the header fields of the MCG entry (in the temporary copy)
1381          */
1382         mcg_hdr->mgid_h              = mgid.gid_prefix;
1383         mcg_hdr->mgid_l              = mgid.gid_guid;
1384         mcg_hdr->next_gid_indx       = 0;
1385 
1386         TAVOR_TNF_EXIT(tavor_mcg_setup_new_hdr);
1387 }
1388 
1389 
1390 /*
1391  * tavor_mcg_hash_list_remove()
1392  *    Context: Can be called only from user or kernel context.
1393  */
1394 static int
1395 tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx,
1396     uint_t prev_indx, tavor_hw_mcg_t *mcg_entry)
1397 {
1398         tavor_mcghdl_t          curr_mcg, prev_mcg, next_mcg;
1399         uint_t                  next_indx;
1400         int                     status;
1401 
1402         /* Get the pointer to "shadow" list for current entry */
1403         curr_mcg = &state->ts_mcghdl[curr_indx];
1404 
1405         /*
1406          * If this is the first entry on a hash chain, then attempt to replace
1407          * the entry with the next entry on the chain.  If there are no
1408          * subsequent entries on the chain, then this is the only entry and
1409          * should be invalidated.
1410          */
1411         if (curr_indx == prev_indx) {
1412 
1413                 /*
1414                  * If this is the only entry on the chain, then invalidate it.
1415                  * Note:  Invalidating an MCG entry means writing all zeros
1416                  * to the entry.  This is only necessary for those MCG
1417                  * entries that are the "head" entries of the individual hash
1418                  * chains.  Regardless of whether this operation returns
1419                  * success or failure, return that result to the caller.
1420                  */
1421                 next_indx = curr_mcg->mcg_next_indx;
1422                 if (next_indx == 0) {
1423                         status = tavor_mcg_entry_invalidate(state, mcg_entry,
1424                             curr_indx);
1425                         bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1426                         TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1427                         return (status);
1428                 }
1429 
1430                 /*
1431                  * Otherwise, this is just the first entry on the chain, so
1432                  * grab the next one
1433                  */
1434                 next_mcg = &state->ts_mcghdl[next_indx];
1435 
1436                 /*
1437                  * Read the next MCG entry into the temporary MCG.  Note:
1438                  * In general, this operation shouldn't fail.  If it does,
1439                  * then it is an indication that something (probably in HW,
1440                  * but maybe in SW) has gone seriously wrong.
1441                  */
1442                 status = tavor_read_mgm_cmd_post(state, mcg_entry, next_indx,
1443                     TAVOR_CMD_NOSLEEP_SPIN);
1444                 if (status != TAVOR_CMD_SUCCESS) {
1445                         TAVOR_WARNING(state, "failed to read MCG entry");
1446                         cmn_err(CE_CONT, "Tavor: READ_MGM command failed: "
1447                             "%08x\n", status);
1448                         TNF_PROBE_2(tavor_mcg_hash_list_rem_read_mgm_cmd_fail,
1449                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1450                             tnf_uint, indx, next_indx);
1451                         TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1452                         return (ibc_get_ci_failure(0));
1453                 }
1454 
1455                 /*
1456                  * Copy/Write the temporary MCG back to the hardware MCG list
1457                  * using the current index.  This essentially removes the
1458                  * current MCG entry from the list by writing over it with
1459                  * the next one.  If this is successful, then we can do the
1460                  * same operation for the "shadow" list.  And we can also
1461                  * free up the Tavor MCG entry resource that was associated
1462                  * with the (old) next entry.  Note:  In general, this
1463                  * operation shouldn't fail.  If it does, then it is an
1464                  * indication that something (probably in HW, but maybe in SW)
1465                  * has gone seriously wrong.
1466                  */
1467                 status = tavor_write_mgm_cmd_post(state, mcg_entry, curr_indx,
1468                     TAVOR_CMD_NOSLEEP_SPIN);
1469                 if (status != TAVOR_CMD_SUCCESS) {
1470                         TAVOR_WARNING(state, "failed to write MCG entry");
1471                         cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
1472                             "%08x\n", status);
1473                         TNF_PROBE_2(tavor_mcg_hash_list_rem_write_mgm_cmd_fail,
1474                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1475                             tnf_uint, indx, curr_indx);
1476                         TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1477                         return (ibc_get_ci_failure(0));
1478                 }
1479 
1480                 /*
1481                  * Copy all the software tracking information from the next
1482                  * entry on the "shadow" MCG list into the current entry on
1483                  * the list.  Then invalidate (zero out) the other "shadow"
1484                  * list entry.
1485                  */
1486                 bcopy(next_mcg, curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1487                 bzero(next_mcg, sizeof (struct tavor_sw_mcg_list_s));
1488 
1489                 /*
1490                  * Free up the Tavor MCG entry resource used by the "next"
1491                  * MCG entry.  That resource is no longer needed by any
1492                  * MCG entry which is first on a hash chain (like the "next"
1493                  * entry has just become).
1494                  */
1495                 tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1496 
1497                 TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1498                 return (DDI_SUCCESS);
1499         }
1500 
1501         /*
1502          * Else if this is the last entry on the hash chain (or a middle
1503          * entry, then we update the previous entry's "next_gid_index" field
1504          * to make it point instead to the next entry on the chain.  By
1505          * skipping over the removed entry in this way, we can then free up
1506          * any resources associated with the current entry.  Note:  We don't
1507          * need to invalidate the "skipped over" hardware entry because it
1508          * will no be longer connected to any hash chains, and if/when it is
1509          * finally re-used, it will be written with entirely new values.
1510          */
1511 
1512         /*
1513          * Read the next MCG entry into the temporary MCG.  Note:  In general,
1514          * this operation shouldn't fail.  If it does, then it is an
1515          * indication that something (probably in HW, but maybe in SW) has
1516          * gone seriously wrong.
1517          */
1518         status = tavor_read_mgm_cmd_post(state, mcg_entry, prev_indx,
1519             TAVOR_CMD_NOSLEEP_SPIN);
1520         if (status != TAVOR_CMD_SUCCESS) {
1521                 TAVOR_WARNING(state, "failed to read MCG entry");
1522                 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
1523                     status);
1524                 TNF_PROBE_2(tavor_mcg_hash_list_rem_read_mgm_cmd_fail,
1525                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1526                     tnf_uint, indx, prev_indx);
1527                 TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1528                 return (ibc_get_ci_failure(0));
1529         }
1530 
1531         /*
1532          * Finally, we update the "next_gid_indx" field in the temporary MCG
1533          * and attempt to write the entry back into the Tavor MCG table.  If
1534          * this succeeds, then we update the "shadow" list to reflect the
1535          * change, free up the Tavor MCG entry resource that was associated
1536          * with the current entry, and return success.  Note:  In general,
1537          * this operation shouldn't fail.  If it does, then it is an indication
1538          * that something (probably in HW, but maybe in SW) has gone seriously
1539          * wrong.
1540          */
1541         mcg_entry->next_gid_indx = curr_mcg->mcg_next_indx;
1542         status = tavor_write_mgm_cmd_post(state, mcg_entry, prev_indx,
1543             TAVOR_CMD_NOSLEEP_SPIN);
1544         if (status != TAVOR_CMD_SUCCESS) {
1545                 TAVOR_WARNING(state, "failed to write MCG entry");
1546                 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
1547                     status);
1548                 TNF_PROBE_2(tavor_mcg_hash_list_rem_write_mgm_cmd_fail,
1549                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1550                     tnf_uint, indx, prev_indx);
1551                 TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1552                 return (ibc_get_ci_failure(0));
1553         }
1554 
1555         /*
1556          * Get the pointer to the "shadow" MCG list entry for the previous
1557          * MCG.  Update its "mcg_next_indx" to point to the next entry
1558          * the one after the current entry. Note:  This next index may be
1559          * zero, indicating the end of the list.
1560          */
1561         prev_mcg = &state->ts_mcghdl[prev_indx];
1562         prev_mcg->mcg_next_indx = curr_mcg->mcg_next_indx;
1563 
1564         /*
1565          * Free up the Tavor MCG entry resource used by the current entry.
1566          * This resource is no longer needed because the chain now skips over
1567          * the current entry.  Then invalidate (zero out) the current "shadow"
1568          * list entry.
1569          */
1570         tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1571         bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1572 
1573         TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1574         return (DDI_SUCCESS);
1575 }
1576 
1577 
1578 /*
1579  * tavor_mcg_entry_invalidate()
1580  *    Context: Can be called only from user or kernel context.
1581  */
1582 static int
1583 tavor_mcg_entry_invalidate(tavor_state_t *state, tavor_hw_mcg_t *mcg_entry,
1584     uint_t indx)
1585 {
1586         int             status;
1587 
1588         TAVOR_TNF_ENTER(tavor_mcg_entry_invalidate);
1589 
1590         /*
1591          * Invalidate the hardware MCG entry by zeroing out this temporary
1592          * MCG and writing it the the hardware.  Note: In general, this
1593          * operation shouldn't fail.  If it does, then it is an indication
1594          * that something (probably in HW, but maybe in SW) has gone seriously
1595          * wrong.
1596          */
1597         bzero(mcg_entry, TAVOR_MCGMEM_SZ(state));
1598         status = tavor_write_mgm_cmd_post(state, mcg_entry, indx,
1599             TAVOR_CMD_NOSLEEP_SPIN);
1600         if (status != TAVOR_CMD_SUCCESS) {
1601                 TAVOR_WARNING(state, "failed to write MCG entry");
1602                 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
1603                     status);
1604                 TNF_PROBE_2(tavor_mcg_entry_invalidate_write_mgm_cmd_fail,
1605                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1606                     tnf_uint, indx, indx);
1607                 TAVOR_TNF_EXIT(tavor_mcg_entry_invalidate);
1608                 return (ibc_get_ci_failure(0));
1609         }
1610 
1611         TAVOR_TNF_EXIT(tavor_mcg_entry_invalidate);
1612         return (DDI_SUCCESS);
1613 }
1614 
1615 
1616 /*
1617  * tavor_mgid_is_valid()
1618  *    Context: Can be called from interrupt or base context.
1619  */
1620 static int
1621 tavor_mgid_is_valid(ib_gid_t gid)
1622 {
1623         uint_t          topbits, flags, scope;
1624 
1625         TAVOR_TNF_ENTER(tavor_mgid_is_valid);
1626 
1627         /*
1628          * According to IBA 1.1 specification (section 4.1.1) a valid
1629          * "multicast GID" must have its top eight bits set to all ones
1630          */
1631         topbits = (gid.gid_prefix >> TAVOR_MCG_TOPBITS_SHIFT) &
1632             TAVOR_MCG_TOPBITS_MASK;
1633         if (topbits != TAVOR_MCG_TOPBITS) {
1634                 TNF_PROBE_0(tavor_mgid_is_valid_invbits_fail, TAVOR_TNF_ERROR,
1635                     "");
1636                 TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1637                 return (0);
1638         }
1639 
1640         /*
1641          * The next 4 bits are the "flag" bits.  These are valid only
1642          * if they are "0" (which correspond to permanently assigned/
1643          * "well-known" multicast GIDs) or "1" (for so-called "transient"
1644          * multicast GIDs).  All other values are reserved.
1645          */
1646         flags = (gid.gid_prefix >> TAVOR_MCG_FLAGS_SHIFT) &
1647             TAVOR_MCG_FLAGS_MASK;
1648         if (!((flags == TAVOR_MCG_FLAGS_PERM) ||
1649             (flags == TAVOR_MCG_FLAGS_NONPERM))) {
1650                 TNF_PROBE_1(tavor_mgid_is_valid_invflags_fail, TAVOR_TNF_ERROR,
1651                     "", tnf_uint, flags, flags);
1652                 TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1653                 return (0);
1654         }
1655 
1656         /*
1657          * The next 4 bits are the "scope" bits.  These are valid only
1658          * if they are "2" (Link-local), "5" (Site-local), "8"
1659          * (Organization-local) or "E" (Global).  All other values
1660          * are reserved (or currently unassigned).
1661          */
1662         scope = (gid.gid_prefix >> TAVOR_MCG_SCOPE_SHIFT) &
1663             TAVOR_MCG_SCOPE_MASK;
1664         if (!((scope == TAVOR_MCG_SCOPE_LINKLOC) ||
1665             (scope == TAVOR_MCG_SCOPE_SITELOC)   ||
1666             (scope == TAVOR_MCG_SCOPE_ORGLOC)    ||
1667             (scope == TAVOR_MCG_SCOPE_GLOBAL))) {
1668                 TNF_PROBE_1(tavor_mgid_is_valid_invscope_fail, TAVOR_TNF_ERROR,
1669                     "", tnf_uint, scope, scope);
1670                 TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1671                 return (0);
1672         }
1673 
1674         /*
1675          * If it passes all of the above checks, then we will consider it
1676          * a valid multicast GID.
1677          */
1678         TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1679         return (1);
1680 }
1681 
1682 
1683 /*
1684  * tavor_mlid_is_valid()
1685  *    Context: Can be called from interrupt or base context.
1686  */
1687 static int
1688 tavor_mlid_is_valid(ib_lid_t lid)
1689 {
1690         TAVOR_TNF_ENTER(tavor_mlid_is_valid);
1691 
1692         /*
1693          * According to IBA 1.1 specification (section 4.1.1) a valid
1694          * "multicast DLID" must be between 0xC000 and 0xFFFE.
1695          */
1696         if ((lid < IB_LID_MC_FIRST) || (lid > IB_LID_MC_LAST)) {
1697                 TNF_PROBE_1(tavor_mlid_is_valid_invdlid_fail, TAVOR_TNF_ERROR,
1698                     "", tnf_uint, mlid, lid);
1699                 TAVOR_TNF_EXIT(tavor_mlid_is_valid);
1700                 return (0);
1701         }
1702 
1703         TAVOR_TNF_EXIT(tavor_mlid_is_valid);
1704         return (1);
1705 }
1706 
1707 
1708 /*
1709  * tavor_pd_alloc()
1710  *    Context: Can be called only from user or kernel context.
1711  */
1712 int
1713 tavor_pd_alloc(tavor_state_t *state, tavor_pdhdl_t *pdhdl, uint_t sleepflag)
1714 {
1715         tavor_rsrc_t    *rsrc;
1716         tavor_pdhdl_t   pd;
1717         int             status;
1718 
1719         TAVOR_TNF_ENTER(tavor_pd_alloc);
1720 
1721         /*
1722          * Allocate the software structure for tracking the protection domain
1723          * (i.e. the Tavor Protection Domain handle).  By default each PD
1724          * structure will have a unique PD number assigned to it.  All that
1725          * is necessary is for software to initialize the PD reference count
1726          * (to zero) and return success.
1727          */
1728         status = tavor_rsrc_alloc(state, TAVOR_PDHDL, 1, sleepflag, &rsrc);
1729         if (status != DDI_SUCCESS) {
1730                 TNF_PROBE_0(tavor_pd_alloc_rsrcalloc_fail, TAVOR_TNF_ERROR, "");
1731                 TAVOR_TNF_EXIT(tavor_pd_alloc);
1732                 return (IBT_INSUFF_RESOURCE);
1733         }
1734         pd = (tavor_pdhdl_t)rsrc->tr_addr;
1735         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1736 
1737         pd->pd_refcnt = 0;
1738         *pdhdl = pd;
1739 
1740         TAVOR_TNF_EXIT(tavor_pd_alloc);
1741         return (DDI_SUCCESS);
1742 }
1743 
1744 
1745 /*
1746  * tavor_pd_free()
1747  *    Context: Can be called only from user or kernel context.
1748  */
1749 int
1750 tavor_pd_free(tavor_state_t *state, tavor_pdhdl_t *pdhdl)
1751 {
1752         tavor_rsrc_t    *rsrc;
1753         tavor_pdhdl_t   pd;
1754 
1755         TAVOR_TNF_ENTER(tavor_pd_free);
1756 
1757         /*
1758          * Pull all the necessary information from the Tavor Protection Domain
1759          * handle.  This is necessary here because the resource for the
1760          * PD is going to be freed up as part of this operation.
1761          */
1762         pd   = *pdhdl;
1763         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1764         rsrc = pd->pd_rsrcp;
1765 
1766         /*
1767          * Check the PD reference count.  If the reference count is non-zero,
1768          * then it means that this protection domain is still referenced by
1769          * some memory region, queue pair, address handle, or other IB object
1770          * If it is non-zero, then return an error.  Otherwise, free the
1771          * Tavor resource and return success.
1772          */
1773         if (pd->pd_refcnt != 0) {
1774                 TNF_PROBE_1(tavor_pd_free_refcnt_fail, TAVOR_TNF_ERROR, "",
1775                     tnf_int, refcnt, pd->pd_refcnt);
1776                 TAVOR_TNF_EXIT(tavor_pd_free);
1777                 return (IBT_PD_IN_USE);
1778         }
1779 
1780         /* Free the Tavor Protection Domain handle */
1781         tavor_rsrc_free(state, &rsrc);
1782 
1783         /* Set the pdhdl pointer to NULL and return success */
1784         *pdhdl = (tavor_pdhdl_t)NULL;
1785 
1786         TAVOR_TNF_EXIT(tavor_pd_free);
1787         return (DDI_SUCCESS);
1788 }
1789 
1790 
1791 /*
1792  * tavor_pd_refcnt_inc()
1793  *    Context: Can be called from interrupt or base context.
1794  */
1795 void
1796 tavor_pd_refcnt_inc(tavor_pdhdl_t pd)
1797 {
1798         /* Increment the protection domain's reference count */
1799         mutex_enter(&pd->pd_lock);
1800         TNF_PROBE_1_DEBUG(tavor_pd_refcnt_inc, TAVOR_TNF_TRACE, "",
1801             tnf_uint, refcnt, pd->pd_refcnt);
1802         pd->pd_refcnt++;
1803         mutex_exit(&pd->pd_lock);
1804 
1805 }
1806 
1807 
1808 /*
1809  * tavor_pd_refcnt_dec()
1810  *    Context: Can be called from interrupt or base context.
1811  */
1812 void
1813 tavor_pd_refcnt_dec(tavor_pdhdl_t pd)
1814 {
1815         /* Decrement the protection domain's reference count */
1816         mutex_enter(&pd->pd_lock);
1817         pd->pd_refcnt--;
1818         TNF_PROBE_1_DEBUG(tavor_pd_refcnt_dec, TAVOR_TNF_TRACE, "",
1819             tnf_uint, refcnt, pd->pd_refcnt);
1820         mutex_exit(&pd->pd_lock);
1821 
1822 }
1823 
1824 
1825 /*
1826  * tavor_port_query()
1827  *    Context: Can be called only from user or kernel context.
1828  */
1829 int
1830 tavor_port_query(tavor_state_t *state, uint_t port, ibt_hca_portinfo_t *pi)
1831 {
1832         sm_portinfo_t           portinfo;
1833         sm_guidinfo_t           guidinfo;
1834         sm_pkey_table_t         pkeytable;
1835         ib_gid_t                *sgid;
1836         uint_t                  sgid_max, pkey_max, tbl_size;
1837         int                     i, j, indx, status;
1838 
1839         TAVOR_TNF_ENTER(tavor_port_query);
1840 
1841         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pi))
1842 
1843         /* Validate that specified port number is legal */
1844         if (!tavor_portnum_is_valid(state, port)) {
1845                 TNF_PROBE_1(tavor_port_query_inv_portnum_fail,
1846                     TAVOR_TNF_ERROR, "", tnf_uint, port, port);
1847                 TAVOR_TNF_EXIT(tavor_port_query);
1848                 return (IBT_HCA_PORT_INVALID);
1849         }
1850 
1851         /*
1852          * We use the Tavor MAD_IFC command to post a GetPortInfo MAD
1853          * to the firmware (for the specified port number).  This returns
1854          * a full PortInfo MAD (in "portinfo") which we subsequently
1855          * parse to fill in the "ibt_hca_portinfo_t" structure returned
1856          * to the IBTF.
1857          */
1858         status = tavor_getportinfo_cmd_post(state, port,
1859             TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
1860         if (status != TAVOR_CMD_SUCCESS) {
1861                 cmn_err(CE_CONT, "Tavor: GetPortInfo (port %02d) command "
1862                     "failed: %08x\n", port, status);
1863                 TNF_PROBE_1(tavor_port_query_getportinfo_cmd_fail,
1864                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1865                 TAVOR_TNF_EXIT(tavor_port_query);
1866                 return (ibc_get_ci_failure(0));
1867         }
1868 
1869         /*
1870          * Parse the PortInfo MAD and fill in the IBTF structure
1871          */
1872         pi->p_base_lid               = portinfo.LID;
1873         pi->p_qkey_violations        = portinfo.Q_KeyViolations;
1874         pi->p_pkey_violations        = portinfo.P_KeyViolations;
1875         pi->p_sm_sl          = portinfo.MasterSMSL;
1876         pi->p_sm_lid         = portinfo.MasterSMLID;
1877         pi->p_linkstate              = portinfo.PortState;
1878         pi->p_port_num               = portinfo.LocalPortNum;
1879         pi->p_phys_state     = portinfo.PortPhysicalState;
1880         pi->p_width_supported        = portinfo.LinkWidthSupported;
1881         pi->p_width_enabled  = portinfo.LinkWidthEnabled;
1882         pi->p_width_active   = portinfo.LinkWidthActive;
1883         pi->p_speed_supported        = portinfo.LinkSpeedSupported;
1884         pi->p_speed_enabled  = portinfo.LinkSpeedEnabled;
1885         pi->p_speed_active   = portinfo.LinkSpeedActive;
1886         pi->p_mtu            = portinfo.MTUCap;
1887         pi->p_lmc            = portinfo.LMC;
1888         pi->p_max_vl         = portinfo.VLCap;
1889         pi->p_subnet_timeout = portinfo.SubnetTimeOut;
1890         pi->p_msg_sz         = ((uint32_t)1 << TAVOR_QP_LOG_MAX_MSGSZ);
1891         tbl_size = state->ts_cfg_profile->cp_log_max_gidtbl;
1892         pi->p_sgid_tbl_sz    = (1 << tbl_size);
1893         tbl_size = state->ts_cfg_profile->cp_log_max_pkeytbl;
1894         pi->p_pkey_tbl_sz    = (1 << tbl_size);
1895 
1896         /*
1897          * Convert InfiniBand-defined port capability flags to the format
1898          * specified by the IBTF
1899          */
1900         if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM)
1901                 pi->p_capabilities |= IBT_PORT_CAP_SM;
1902         if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM_DISABLED)
1903                 pi->p_capabilities |= IBT_PORT_CAP_SM_DISABLED;
1904         if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SNMP_SUPPD)
1905                 pi->p_capabilities |= IBT_PORT_CAP_SNMP_TUNNEL;
1906         if (portinfo.CapabilityMask & SM_CAP_MASK_IS_DM_SUPPD)
1907                 pi->p_capabilities |= IBT_PORT_CAP_DM;
1908         if (portinfo.CapabilityMask & SM_CAP_MASK_IS_VM_SUPPD)
1909                 pi->p_capabilities |= IBT_PORT_CAP_VENDOR;
1910 
1911         /*
1912          * Fill in the SGID table.  Since the only access to the Tavor
1913          * GID tables is through the firmware's MAD_IFC interface, we
1914          * post as many GetGUIDInfo MADs as necessary to read in the entire
1915          * contents of the SGID table (for the specified port).  Note:  The
1916          * GetGUIDInfo command only gets eight GUIDs per operation.  These
1917          * GUIDs are then appended to the GID prefix for the port (from the
1918          * GetPortInfo above) to form the entire SGID table.
1919          */
1920         for (i = 0; i < pi->p_sgid_tbl_sz; i += 8) {
1921                 status = tavor_getguidinfo_cmd_post(state, port, i >> 3,
1922                     TAVOR_SLEEPFLAG_FOR_CONTEXT(), &guidinfo);
1923                 if (status != TAVOR_CMD_SUCCESS) {
1924                         cmn_err(CE_CONT, "Tavor: GetGUIDInfo (port %02d) "
1925                             "command failed: %08x\n", port, status);
1926                         TNF_PROBE_1(tavor_port_query_getguidinfo_cmd_fail,
1927                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1928                         TAVOR_TNF_EXIT(tavor_port_query);
1929                         return (ibc_get_ci_failure(0));
1930                 }
1931 
1932                 /* Figure out how many of the entries are valid */
1933                 sgid_max = min((pi->p_sgid_tbl_sz - i), 8);
1934                 for (j = 0; j < sgid_max; j++) {
1935                         indx = (i + j);
1936                         sgid = &pi->p_sgid_tbl[indx];
1937                         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgid))
1938                         sgid->gid_prefix = portinfo.GidPrefix;
1939                         sgid->gid_guid        = guidinfo.GUIDBlocks[j];
1940                 }
1941         }
1942 
1943         /*
1944          * Fill in the PKey table.  Just as for the GID tables above, the
1945          * only access to the Tavor PKey tables is through the firmware's
1946          * MAD_IFC interface.  We post as many GetPKeyTable MADs as necessary
1947          * to read in the entire contents of the PKey table (for the specified
1948          * port).  Note:  The GetPKeyTable command only gets 32 PKeys per
1949          * operation.
1950          */
1951         for (i = 0; i < pi->p_pkey_tbl_sz; i += 32) {
1952                 status = tavor_getpkeytable_cmd_post(state, port, i,
1953                     TAVOR_SLEEPFLAG_FOR_CONTEXT(), &pkeytable);
1954                 if (status != TAVOR_CMD_SUCCESS) {
1955                         cmn_err(CE_CONT, "Tavor: GetPKeyTable (port %02d) "
1956                             "command failed: %08x\n", port, status);
1957                         TNF_PROBE_1(tavor_port_query_getpkeytable_cmd_fail,
1958                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1959                         TAVOR_TNF_EXIT(tavor_port_query);
1960                         return (ibc_get_ci_failure(0));
1961                 }
1962 
1963                 /* Figure out how many of the entries are valid */
1964                 pkey_max = min((pi->p_pkey_tbl_sz - i), 32);
1965                 for (j = 0; j < pkey_max; j++) {
1966                         indx = (i + j);
1967                         pi->p_pkey_tbl[indx] = pkeytable.P_KeyTableBlocks[j];
1968                 }
1969         }
1970 
1971         TAVOR_TNF_EXIT(tavor_port_query);
1972         return (DDI_SUCCESS);
1973 }
1974 
1975 
1976 /*
1977  * tavor_port_modify()
1978  *    Context: Can be called only from user or kernel context.
1979  */
1980 /* ARGSUSED */
1981 int
1982 tavor_port_modify(tavor_state_t *state, uint8_t port,
1983     ibt_port_modify_flags_t flags, uint8_t init_type)
1984 {
1985         sm_portinfo_t   portinfo;
1986         uint32_t        capmask, reset_qkey;
1987         int             status;
1988 
1989         TAVOR_TNF_ENTER(tavor_port_modify);
1990 
1991         /*
1992          * Return an error if either of the unsupported flags are set
1993          */
1994         if ((flags & IBT_PORT_SHUTDOWN) ||
1995             (flags & IBT_PORT_SET_INIT_TYPE)) {
1996                 TNF_PROBE_1(tavor_port_modify_inv_flags_fail,
1997                     TAVOR_TNF_ERROR, "", tnf_uint, flags, flags);
1998                 TAVOR_TNF_EXIT(tavor_port_modify);
1999                 return (IBT_NOT_SUPPORTED);
2000         }
2001 
2002         /*
2003          * Determine whether we are trying to reset the QKey counter
2004          */
2005         reset_qkey = (flags & IBT_PORT_RESET_QKEY) ? 1 : 0;
2006 
2007         /* Validate that specified port number is legal */
2008         if (!tavor_portnum_is_valid(state, port)) {
2009                 TNF_PROBE_1(tavor_port_modify_inv_portnum_fail,
2010                     TAVOR_TNF_ERROR, "", tnf_uint, port, port);
2011                 TAVOR_TNF_EXIT(tavor_port_modify);
2012                 return (IBT_HCA_PORT_INVALID);
2013         }
2014 
2015         /*
2016          * Use the Tavor MAD_IFC command to post a GetPortInfo MAD to the
2017          * firmware (for the specified port number).  This returns a full
2018          * PortInfo MAD (in "portinfo") from which we pull the current
2019          * capability mask.  We then modify the capability mask as directed
2020          * by the "pmod_flags" field, and write the updated capability mask
2021          * using the Tavor SET_IB command (below).
2022          */
2023         status = tavor_getportinfo_cmd_post(state, port,
2024             TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
2025         if (status != TAVOR_CMD_SUCCESS) {
2026                 TNF_PROBE_1(tavor_port_modify_getportinfo_cmd_fail,
2027                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
2028                 TAVOR_TNF_EXIT(tavor_port_modify);
2029                 return (ibc_get_ci_failure(0));
2030         }
2031 
2032         /*
2033          * Convert InfiniBand-defined port capability flags to the format
2034          * specified by the IBTF.  Specifically, we modify the capability
2035          * mask based on the specified values.
2036          */
2037         capmask = portinfo.CapabilityMask;
2038 
2039         if (flags & IBT_PORT_RESET_SM)
2040                 capmask &= ~SM_CAP_MASK_IS_SM;
2041         else if (flags & IBT_PORT_SET_SM)
2042                 capmask |= SM_CAP_MASK_IS_SM;
2043 
2044         if (flags & IBT_PORT_RESET_SNMP)
2045                 capmask &= ~SM_CAP_MASK_IS_SNMP_SUPPD;
2046         else if (flags & IBT_PORT_SET_SNMP)
2047                 capmask |= SM_CAP_MASK_IS_SNMP_SUPPD;
2048 
2049         if (flags & IBT_PORT_RESET_DEVMGT)
2050                 capmask &= ~SM_CAP_MASK_IS_DM_SUPPD;
2051         else if (flags & IBT_PORT_SET_DEVMGT)
2052                 capmask |= SM_CAP_MASK_IS_DM_SUPPD;
2053 
2054         if (flags & IBT_PORT_RESET_VENDOR)
2055                 capmask &= ~SM_CAP_MASK_IS_VM_SUPPD;
2056         else if (flags & IBT_PORT_SET_VENDOR)
2057                 capmask |= SM_CAP_MASK_IS_VM_SUPPD;
2058 
2059         /*
2060          * Use the Tavor SET_IB command to update the capability mask and
2061          * (possibly) reset the QKey violation counter for the specified port.
2062          * Note: In general, this operation shouldn't fail.  If it does, then
2063          * it is an indication that something (probably in HW, but maybe in
2064          * SW) has gone seriously wrong.
2065          */
2066         status = tavor_set_ib_cmd_post(state, capmask, port, reset_qkey,
2067             TAVOR_SLEEPFLAG_FOR_CONTEXT());
2068         if (status != TAVOR_CMD_SUCCESS) {
2069                 TAVOR_WARNING(state, "failed to modify port capabilities");
2070                 cmn_err(CE_CONT, "Tavor: SET_IB (port %02d) command failed: "
2071                     "%08x\n", port, status);
2072                 TNF_PROBE_1(tavor_port_modify_set_ib_cmd_fail,
2073                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
2074                 TAVOR_TNF_EXIT(tavor_port_modify);
2075                 return (ibc_get_ci_failure(0));
2076         }
2077 
2078         TAVOR_TNF_EXIT(tavor_port_modify);
2079         return (DDI_SUCCESS);
2080 }
2081 
2082 
2083 /*
2084  * tavor_set_addr_path()
2085  *    Context: Can be called from interrupt or base context.
2086  *
2087  * Note: This routine is used for two purposes.  It is used to fill in the
2088  * Tavor UDAV fields, and it is used to fill in the address path information
2089  * for QPs.  Because the two Tavor structures are similar, common fields can
2090  * be filled in here.  Because they are slightly different, however, we pass
2091  * an additional flag to indicate which type is being filled.
2092  */
2093 int
2094 tavor_set_addr_path(tavor_state_t *state, ibt_adds_vect_t *av,
2095     tavor_hw_addr_path_t *path, uint_t type, tavor_qphdl_t qp)
2096 {
2097         uint_t          gidtbl_sz;
2098 
2099         TAVOR_TNF_ENTER(tavor_set_addr_path);
2100 
2101         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2102         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2103 
2104         path->ml_path        = av->av_src_path;
2105         path->rlid   = av->av_dlid;
2106         path->sl     = av->av_srvl;
2107 
2108         /* Port number only valid (in "av_port_num") if this is a UDAV */
2109         if (type == TAVOR_ADDRPATH_UDAV) {
2110                 path->portnum = av->av_port_num;
2111         }
2112 
2113         /*
2114          * Validate (and fill in) static rate.
2115          *
2116          * The stat_rate_sup is used to decide how to set the rate and
2117          * if it is zero, the driver uses the old interface.
2118          */
2119         if (state->ts_devlim.stat_rate_sup) {
2120                 if (av->av_srate == IBT_SRATE_20) {
2121                         path->max_stat_rate = 0; /* 4x@DDR injection rate */
2122                 } else if (av->av_srate == IBT_SRATE_5) {
2123                         path->max_stat_rate = 3; /* 1x@DDR injection rate */
2124                 } else if (av->av_srate == IBT_SRATE_10) {
2125                         path->max_stat_rate = 2; /* 4x@SDR injection rate */
2126                 } else if (av->av_srate == IBT_SRATE_2) {
2127                         path->max_stat_rate = 1; /* 1x@SDR injection rate */
2128                 } else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) {
2129                         path->max_stat_rate = 0; /* Max */
2130                 } else {
2131                         TNF_PROBE_1(tavor_set_addr_path_inv_srate_fail,
2132                             TAVOR_TNF_ERROR, "", tnf_uint, srate, av->av_srate);
2133                         TAVOR_TNF_EXIT(tavor_set_addr_path);
2134                         return (IBT_STATIC_RATE_INVALID);
2135                 }
2136         } else {
2137                 if (av->av_srate == IBT_SRATE_10) {
2138                         path->max_stat_rate = 0; /* 4x@SDR injection rate */
2139                 } else if (av->av_srate == IBT_SRATE_2) {
2140                         path->max_stat_rate = 1; /* 1x@SDR injection rate */
2141                 } else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) {
2142                         path->max_stat_rate = 0; /* Max */
2143                 } else {
2144                         TNF_PROBE_1(tavor_set_addr_path_inv_srate_fail,
2145                             TAVOR_TNF_ERROR, "", tnf_uint, srate, av->av_srate);
2146                         TAVOR_TNF_EXIT(tavor_set_addr_path);
2147                         return (IBT_STATIC_RATE_INVALID);
2148                 }
2149         }
2150 
2151         /*
2152          * If this is a QP operation save asoft copy.
2153          */
2154         if (qp) {
2155                 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(qp->qp_save_srate))
2156                 qp->qp_save_srate = av->av_srate;
2157         }
2158 
2159         /* If "grh" flag is set, then check for valid SGID index too */
2160         gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
2161         if ((av->av_send_grh) && (av->av_sgid_ix > gidtbl_sz)) {
2162                 TNF_PROBE_1(tavor_set_addr_path_inv_sgid_ix_fail,
2163                     TAVOR_TNF_ERROR, "", tnf_uint, sgid_ix, av->av_sgid_ix);
2164                 TAVOR_TNF_EXIT(tavor_set_addr_path);
2165                 return (IBT_SGID_INVALID);
2166         }
2167 
2168         /*
2169          * Fill in all "global" values regardless of the value in the GRH
2170          * flag.  Because "grh" is not set unless "av_send_grh" is set, the
2171          * hardware will ignore the other "global" values as necessary.  Note:
2172          * SW does this here to enable later query operations to return
2173          * exactly the same params that were passed when the addr path was
2174          * last written.
2175          */
2176         path->grh = av->av_send_grh;
2177         if (type == TAVOR_ADDRPATH_QP) {
2178                 path->mgid_index = av->av_sgid_ix;
2179         } else {
2180                 /*
2181                  * For Tavor UDAV, the "mgid_index" field is the index into
2182                  * a combined table (not a per-port table). So some extra
2183                  * calculations are necessary.
2184                  */
2185                 path->mgid_index = ((av->av_port_num - 1) * gidtbl_sz) +
2186                     av->av_sgid_ix;
2187         }
2188         path->flow_label = av->av_flow;
2189         path->tclass  = av->av_tclass;
2190         path->hop_limit       = av->av_hop;
2191         path->rgid_h  = av->av_dgid.gid_prefix;
2192 
2193         /*
2194          * According to Tavor PRM, the (31:0) part of rgid_l must be set to
2195          * "0x2" if the 'grh' or 'g' bit is cleared.  It also says that we
2196          * only need to do it for UDAV's.  So we enforce that here.
2197          *
2198          * NOTE: The entire 64 bits worth of GUID info is actually being
2199          * preserved (for UDAVs) by the callers of this function
2200          * (tavor_ah_alloc() and tavor_ah_modify()) and as long as the
2201          * 'grh' bit is not set, the upper 32 bits (63:32) of rgid_l are
2202          * "don't care".
2203          */
2204         if ((path->grh) || (type == TAVOR_ADDRPATH_QP)) {
2205                 path->rgid_l = av->av_dgid.gid_guid;
2206         } else {
2207                 path->rgid_l = 0x2;
2208         }
2209 
2210         TAVOR_TNF_EXIT(tavor_set_addr_path);
2211         return (DDI_SUCCESS);
2212 }
2213 
2214 
2215 /*
2216  * tavor_get_addr_path()
2217  *    Context: Can be called from interrupt or base context.
2218  *
2219  * Note: Just like tavor_set_addr_path() above, this routine is used for two
2220  * purposes.  It is used to read in the Tavor UDAV fields, and it is used to
2221  * read in the address path information for QPs.  Because the two Tavor
2222  * structures are similar, common fields can be read in here.  But because
2223  * they are slightly different, we pass an additional flag to indicate which
2224  * type is being read.
2225  */
2226 void
2227 tavor_get_addr_path(tavor_state_t *state, tavor_hw_addr_path_t *path,
2228     ibt_adds_vect_t *av, uint_t type, tavor_qphdl_t qp)
2229 {
2230         uint_t          gidtbl_sz;
2231 
2232         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2233         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2234 
2235         av->av_src_path      = path->ml_path;
2236         av->av_port_num      = path->portnum;
2237         av->av_dlid  = path->rlid;
2238         av->av_srvl  = path->sl;
2239 
2240         /*
2241          * Set "av_ipd" value from max_stat_rate.
2242          */
2243         if (qp) {
2244                 /*
2245                  * If a QP operation use the soft copy
2246                  */
2247                 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(qp->qp_save_srate))
2248                 av->av_srate = qp->qp_save_srate;
2249         } else {
2250                 /*
2251                  * The stat_rate_sup is used to decide how the srate value is
2252                  * set and
2253                  * if it is zero, the driver uses the old interface.
2254                  */
2255                 if (state->ts_devlim.stat_rate_sup) {
2256                         if (path->max_stat_rate      == 0) {
2257                                 av->av_srate = IBT_SRATE_20; /* 4x@DDR rate */
2258                         } else if (path->max_stat_rate       == 1) {
2259                                 av->av_srate = IBT_SRATE_2;  /* 1x@SDR rate */
2260                         } else if (path->max_stat_rate       == 2) {
2261                                 av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
2262                         } else if (path->max_stat_rate       == 3) {
2263                                 av->av_srate = IBT_SRATE_5;  /* 1xDDR rate */
2264                         }
2265                 } else {
2266                         if (path->max_stat_rate      == 0) {
2267                                 av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
2268                         } else if (path->max_stat_rate       == 1) {
2269                                 av->av_srate = IBT_SRATE_2;  /* 1x@SDR rate */
2270                         }
2271                 }
2272         }
2273 
2274         /*
2275          * Extract all "global" values regardless of the value in the GRH
2276          * flag.  Because "av_send_grh" is set only if "grh" is set, software
2277          * knows to ignore the other "global" values as necessary.  Note: SW
2278          * does it this way to enable these query operations to return exactly
2279          * the same params that were passed when the addr path was last written.
2280          */
2281         av->av_send_grh              = path->grh;
2282         if (type == TAVOR_ADDRPATH_QP) {
2283                 av->av_sgid_ix  = path->mgid_index;
2284         } else {
2285                 /*
2286                  * For Tavor UDAV, the "mgid_index" field is the index into
2287                  * a combined table (not a per-port table). So some extra
2288                  * calculations are necessary.
2289                  */
2290                 gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
2291                 av->av_sgid_ix = path->mgid_index - ((av->av_port_num - 1) *
2292                     gidtbl_sz);
2293         }
2294         av->av_flow          = path->flow_label;
2295         av->av_tclass                = path->tclass;
2296         av->av_hop           = path->hop_limit;
2297         av->av_dgid.gid_prefix       = path->rgid_h;
2298         av->av_dgid.gid_guid = path->rgid_l;
2299 }
2300 
2301 
2302 /*
2303  * tavor_portnum_is_valid()
2304  *    Context: Can be called from interrupt or base context.
2305  */
2306 int
2307 tavor_portnum_is_valid(tavor_state_t *state, uint_t portnum)
2308 {
2309         uint_t  max_port;
2310 
2311         max_port = state->ts_cfg_profile->cp_num_ports;
2312         if ((portnum <= max_port) && (portnum != 0)) {
2313                 return (1);
2314         } else {
2315                 return (0);
2316         }
2317 }
2318 
2319 
2320 /*
2321  * tavor_pkeyindex_is_valid()
2322  *    Context: Can be called from interrupt or base context.
2323  */
2324 int
2325 tavor_pkeyindex_is_valid(tavor_state_t *state, uint_t pkeyindx)
2326 {
2327         uint_t  max_pkeyindx;
2328 
2329         max_pkeyindx = 1 << state->ts_cfg_profile->cp_log_max_pkeytbl;
2330         if (pkeyindx < max_pkeyindx) {
2331                 return (1);
2332         } else {
2333                 return (0);
2334         }
2335 }
2336 
2337 
2338 /*
2339  * tavor_queue_alloc()
2340  *    Context: Can be called from interrupt or base context.
2341  */
2342 int
2343 tavor_queue_alloc(tavor_state_t *state, tavor_qalloc_info_t *qa_info,
2344     uint_t sleepflag)
2345 {
2346         ddi_dma_attr_t          dma_attr;
2347         int                     (*callback)(caddr_t);
2348         uint64_t                realsize, alloc_mask;
2349         uint_t                  dma_xfer_mode, type;
2350         int                     flag, status;
2351 
2352         TAVOR_TNF_ENTER(tavor_queue_alloc);
2353 
2354         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2355 
2356         /* Set the callback flag appropriately */
2357         callback = (sleepflag == TAVOR_SLEEP) ? DDI_DMA_SLEEP :
2358             DDI_DMA_DONTWAIT;
2359 
2360         /*
2361          * Initialize many of the default DMA attributes.  Then set additional
2362          * alignment restrictions as necessary for the queue memory.  Also
2363          * respect the configured value for IOMMU bypass
2364          */
2365         tavor_dma_attr_init(&dma_attr);
2366         dma_attr.dma_attr_align = qa_info->qa_bind_align;
2367         type = state->ts_cfg_profile->cp_iommu_bypass;
2368         if (type == TAVOR_BINDMEM_BYPASS) {
2369                 dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2370         }
2371 
2372         /* Allocate a DMA handle */
2373         status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr, callback, NULL,
2374             &qa_info->qa_dmahdl);
2375         if (status != DDI_SUCCESS) {
2376                 TNF_PROBE_0(tavor_queue_alloc_dmahdl_fail, TAVOR_TNF_ERROR, "");
2377                 TAVOR_TNF_EXIT(tavor_queue_alloc);
2378                 return (DDI_FAILURE);
2379         }
2380 
2381         /*
2382          * Determine the amount of memory to allocate, depending on the values
2383          * in "qa_bind_align" and "qa_alloc_align".  The problem we are trying
2384          * to solve here is that allocating a DMA handle with IOMMU bypass
2385          * (DDI_DMA_FORCE_PHYSICAL) constrains us to only requesting alignments
2386          * that are less than the page size.  Since we may need stricter
2387          * alignments on the memory allocated by ddi_dma_mem_alloc() (e.g. in
2388          * Tavor QP work queue memory allocation), we use the following method
2389          * to calculate how much additional memory to request, and we enforce
2390          * our own alignment on the allocated result.
2391          */
2392         alloc_mask = qa_info->qa_alloc_align - 1;
2393         if (qa_info->qa_bind_align == qa_info->qa_alloc_align) {
2394                 realsize = qa_info->qa_size;
2395         } else {
2396                 realsize = qa_info->qa_size + alloc_mask;
2397         }
2398 
2399         /*
2400          * If we are to allocate the queue from system memory, then use
2401          * ddi_dma_mem_alloc() to find the space.  Otherwise, if we are to
2402          * allocate the queue from locally-attached DDR memory, then use the
2403          * vmem allocator to find the space.  In either case, return a pointer
2404          * to the memory range allocated (including any necessary alignment
2405          * adjustments), the "real" memory pointer, the "real" size, and a
2406          * ddi_acc_handle_t to use when reading from/writing to the memory.
2407          */
2408         if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {
2409 
2410                 /*
2411                  * Determine whether to map STREAMING or CONSISTENT.  This is
2412                  * based on the value set in the configuration profile at
2413                  * attach time.
2414                  */
2415                 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
2416 
2417                 /* Allocate system memory for the queue */
2418                 status = ddi_dma_mem_alloc(qa_info->qa_dmahdl, realsize,
2419                     &state->ts_reg_accattr, dma_xfer_mode, callback, NULL,
2420                     (caddr_t *)&qa_info->qa_buf_real,
2421                     (size_t *)&qa_info->qa_buf_realsz, &qa_info->qa_acchdl);
2422                 if (status != DDI_SUCCESS) {
2423                         ddi_dma_free_handle(&qa_info->qa_dmahdl);
2424                         TNF_PROBE_0(tavor_queue_alloc_dma_memalloc_fail,
2425                             TAVOR_TNF_ERROR, "");
2426                         TAVOR_TNF_EXIT(tavor_queue_alloc);
2427                         return (DDI_FAILURE);
2428                 }
2429 
2430                 /*
2431                  * Save temporary copy of the real pointer.  (This may be
2432                  * modified in the last step below).
2433                  */
2434                 qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2435 
2436         } else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {
2437 
2438                 /* Allocate userland mappable memory for the queue */
2439                 flag = (sleepflag == TAVOR_SLEEP) ? DDI_UMEM_SLEEP :
2440                     DDI_UMEM_NOSLEEP;
2441                 qa_info->qa_buf_real = ddi_umem_alloc(realsize, flag,
2442                     &qa_info->qa_umemcookie);
2443                 if (qa_info->qa_buf_real == NULL) {
2444                         ddi_dma_free_handle(&qa_info->qa_dmahdl);
2445                         TNF_PROBE_0(tavor_queue_alloc_umem_fail,
2446                             TAVOR_TNF_ERROR, "");
2447                         TAVOR_TNF_EXIT(tavor_queue_alloc);
2448                         return (DDI_FAILURE);
2449                 }
2450 
2451                 /*
2452                  * Save temporary copy of the real pointer.  (This may be
2453                  * modified in the last step below).
2454                  */
2455                 qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2456 
2457         } else {  /* TAVOR_QUEUE_LOCATION_INDDR */
2458 
2459                 /* Allocate DDR memory for the queue */
2460                 flag = (sleepflag == TAVOR_SLEEP) ? VM_SLEEP : VM_NOSLEEP;
2461                 qa_info->qa_buf_real = (uint32_t *)vmem_xalloc(
2462                     state->ts_ddrvmem, realsize, qa_info->qa_bind_align, 0, 0,
2463                     NULL, NULL, flag);
2464                 if (qa_info->qa_buf_real == NULL) {
2465                         ddi_dma_free_handle(&qa_info->qa_dmahdl);
2466                         TNF_PROBE_0(tavor_queue_alloc_vmxa_fail,
2467                             TAVOR_TNF_ERROR, "");
2468                         TAVOR_TNF_EXIT(tavor_queue_alloc);
2469                         return (DDI_FAILURE);
2470                 }
2471 
2472                 /*
2473                  * Since "qa_buf_real" will be a PCI address (the offset into
2474                  * the DDR memory), we first need to do some calculations to
2475                  * convert it to its kernel mapped address.  (Note: This may
2476                  * be modified again below, when any additional "alloc"
2477                  * alignment constraint is applied).
2478                  */
2479                 qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
2480                     state->ts_reg_ddr_baseaddr) + ((uintptr_t)
2481                     qa_info->qa_buf_real - state->ts_ddr.ddr_baseaddr));
2482                 qa_info->qa_buf_realsz       = realsize;
2483                 qa_info->qa_acchdl   = state->ts_reg_ddrhdl;
2484         }
2485 
2486         /*
2487          * The last step is to ensure that the final address ("qa_buf_aligned")
2488          * has the appropriate "alloc" alignment restriction applied to it
2489          * (if necessary).
2490          */
2491         if (qa_info->qa_bind_align != qa_info->qa_alloc_align) {
2492                 qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
2493                     qa_info->qa_buf_aligned + alloc_mask) & ~alloc_mask);
2494         }
2495 
2496         TAVOR_TNF_EXIT(tavor_queue_alloc);
2497         return (DDI_SUCCESS);
2498 }
2499 
2500 
2501 /*
2502  * tavor_queue_free()
2503  *    Context: Can be called from interrupt or base context.
2504  */
2505 void
2506 tavor_queue_free(tavor_state_t *state, tavor_qalloc_info_t *qa_info)
2507 {
2508         TAVOR_TNF_ENTER(tavor_queue_free);
2509 
2510         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2511 
2512         /*
2513          * Depending on how (i.e. from where) we allocated the memory for
2514          * this queue, we choose the appropriate method for releasing the
2515          * resources.
2516          */
2517         if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {
2518 
2519                 ddi_dma_mem_free(&qa_info->qa_acchdl);
2520 
2521         } else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {
2522 
2523                 ddi_umem_free(qa_info->qa_umemcookie);
2524 
2525         } else {  /* TAVOR_QUEUE_LOCATION_INDDR */
2526 
2527                 vmem_xfree(state->ts_ddrvmem, qa_info->qa_buf_real,
2528                     qa_info->qa_buf_realsz);
2529         }
2530 
2531         /* Always free the dma handle */
2532         ddi_dma_free_handle(&qa_info->qa_dmahdl);
2533 
2534         TAVOR_TNF_EXIT(tavor_queue_free);
2535 }
2536 
2537 
2538 /*
2539  * tavor_dmaattr_get()
2540  *    Context: Can be called from interrupt or base context.
2541  */
2542 void
2543 tavor_dma_attr_init(ddi_dma_attr_t *dma_attr)
2544 {
2545         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*dma_attr))
2546 
2547         dma_attr->dma_attr_version   = DMA_ATTR_V0;
2548         dma_attr->dma_attr_addr_lo   = 0;
2549         dma_attr->dma_attr_addr_hi   = 0xFFFFFFFFFFFFFFFFull;
2550         dma_attr->dma_attr_count_max = 0xFFFFFFFFFFFFFFFFull;
2551         dma_attr->dma_attr_align     = 1;
2552         dma_attr->dma_attr_burstsizes        = 0x3FF;
2553         dma_attr->dma_attr_minxfer   = 1;
2554         dma_attr->dma_attr_maxxfer   = 0xFFFFFFFFFFFFFFFFull;
2555         dma_attr->dma_attr_seg               = 0xFFFFFFFFFFFFFFFFull;
2556         dma_attr->dma_attr_sgllen    = 0x7FFFFFFF;
2557         dma_attr->dma_attr_granular  = 1;
2558         dma_attr->dma_attr_flags     = 0;
2559 }