1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * tavor_misc.c
  29  *    Tavor Miscellaneous routines - Address Handle, Multicast, Protection
  30  *    Domain, and port-related operations
  31  *
  32  *    Implements all the routines necessary for allocating, freeing, querying
  33  *    and modifying Address Handles and Protection Domains.  Also implements
  34  *    all the routines necessary for adding and removing Queue Pairs to/from
  35  *    Multicast Groups.  Lastly, it implements the routines necessary for
  36  *    port-related query and modify operations.
  37  */
  38 
  39 #include <sys/types.h>
  40 #include <sys/conf.h>
  41 #include <sys/ddi.h>
  42 #include <sys/sunddi.h>
  43 #include <sys/modctl.h>
  44 #include <sys/bitmap.h>
  45 #include <sys/sysmacros.h>
  46 
  47 #include <sys/ib/adapters/tavor/tavor.h>
  48 
/*
 * Forward declarations for the file-local (static) helper routines used by
 * the Address Handle and Multicast Group code in this file.
 */
static void tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav,
    uint_t flag);
static int tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg,
    tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp, uint_t *qp_found);
static int tavor_mcg_qplist_remove(tavor_mcghdl_t mcg,
    tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp);
static void tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp);
static void tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp);
static uint_t tavor_mcg_walk_mgid_hash(tavor_state_t *state,
    uint64_t start_indx, ib_gid_t mgid, uint_t *prev_indx);
static void tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg,
    tavor_hw_mcg_t *mcg_hdr, ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc);
static int tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx,
    uint_t prev_indx, tavor_hw_mcg_t *mcg_entry);
static int tavor_mcg_entry_invalidate(tavor_state_t *state,
    tavor_hw_mcg_t *mcg_entry, uint_t indx);
static int tavor_mgid_is_valid(ib_gid_t gid);
static int tavor_mlid_is_valid(ib_lid_t lid);
  67 
  68 
  69 /*
  70  * tavor_ah_alloc()
  71  *    Context: Can be called only from user or kernel context.
  72  */
  73 int
  74 tavor_ah_alloc(tavor_state_t *state, tavor_pdhdl_t pd,
  75     ibt_adds_vect_t *attr_p, tavor_ahhdl_t *ahhdl, uint_t sleepflag)
  76 {
  77         tavor_rsrc_t            *udav, *rsrc;
  78         tavor_hw_udav_t         udav_entry;
  79         tavor_ahhdl_t           ah;
  80         ibt_mr_attr_t           mr_attr;
  81         tavor_mr_options_t      op;
  82         tavor_mrhdl_t           mr;
  83         uint64_t                data;
  84         uint32_t                size;
  85         int                     status, i, flag;
  86         char                    *errormsg;
  87 
  88         TAVOR_TNF_ENTER(tavor_ah_alloc);
  89 
  90         /*
  91          * Someday maybe the "ibt_adds_vect_t *attr_p" will be NULL to
  92          * indicate that we wish to allocate an "invalid" (i.e. empty)
  93          * address handle XXX
  94          */
  95 
  96         /* Validate that specified port number is legal */
  97         if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) {
  98                 /* Set "status" and "errormsg" and goto failure */
  99                 TAVOR_TNF_FAIL(IBT_HCA_PORT_INVALID, "invalid port num");
 100                 goto ahalloc_fail;
 101         }
 102 
 103         /*
 104          * Allocate a UDAV entry.  This will be filled in with all the
 105          * necessary parameters to define the Address Handle.  Unlike the
 106          * other hardware resources no ownership transfer takes place as
 107          * these UDAV entries are always owned by hardware.
 108          */
 109         status = tavor_rsrc_alloc(state, TAVOR_UDAV, 1, sleepflag, &udav);
 110         if (status != DDI_SUCCESS) {
 111                 /* Set "status" and "errormsg" and goto failure */
 112                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed UDAV");
 113                 goto ahalloc_fail;
 114         }
 115 
 116         /*
 117          * Allocate the software structure for tracking the address handle
 118          * (i.e. the Tavor Address Handle struct).  If we fail here, we must
 119          * undo the previous resource allocation.
 120          */
 121         status = tavor_rsrc_alloc(state, TAVOR_AHHDL, 1, sleepflag, &rsrc);
 122         if (status != DDI_SUCCESS) {
 123                 /* Set "status" and "errormsg" and goto failure */
 124                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed AH handler");
 125                 goto ahalloc_fail1;
 126         }
 127         ah = (tavor_ahhdl_t)rsrc->tr_addr;
 128 
 129         /* Increment the reference count on the protection domain (PD) */
 130         tavor_pd_refcnt_inc(pd);
 131 
 132         /*
 133          * Fill in the UDAV entry.  Note: We are only filling in a temporary
 134          * copy here, which we will later copy into the actual entry in
 135          * Tavor DDR memory.  This starts be zeroing out the temporary copy
 136          * and then calling tavor_set_addr_path() to fill in the common
 137          * portions that can be pulled from the "ibt_adds_vect_t" passed in
 138          */
 139         bzero(&udav_entry, sizeof (tavor_hw_udav_t));
 140         status = tavor_set_addr_path(state, attr_p,
 141             (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL);
 142         if (status != DDI_SUCCESS) {
 143                 tavor_pd_refcnt_dec(pd);
 144                 tavor_rsrc_free(state, &rsrc);
 145                 tavor_rsrc_free(state, &udav);
 146                 /* Set "status" and "errormsg" and goto failure */
 147                 TAVOR_TNF_FAIL(status, "failed in tavor_set_addr_path");
 148                 goto ahalloc_fail;
 149         }
 150         udav_entry.pd     = pd->pd_pdnum;
 151         udav_entry.msg_sz = state->ts_cfg_profile->cp_max_mtu - 1;
 152 
 153         /*
 154          * Register the memory for the UDAV.  The memory for the UDAV must
 155          * be registered in the Tavor TPT tables.  This gives us the LKey
 156          * that we will need when we later post a UD work request that
 157          * uses this address handle.
 158          * We might be able to pre-register all the memory for the UDAV XXX
 159          */
 160         flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
 161         mr_attr.mr_vaddr = (uint64_t)(uintptr_t)udav->tr_addr;
 162         mr_attr.mr_len   = udav->tr_len;
 163         mr_attr.mr_as    = NULL;
 164         mr_attr.mr_flags = flag;
 165         op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
 166         op.mro_bind_dmahdl = NULL;
 167         op.mro_bind_override_addr = 0;
 168         status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
 169         if (status != DDI_SUCCESS) {
 170                 /* Set "status" and "errormsg" and goto failure */
 171                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
 172                 goto ahalloc_fail2;
 173         }
 174 
 175         /*
 176          * Fill in the UDAV entry.  Here we copy all the information from
 177          * the temporary UDAV into the DDR memory for the real UDAV entry.
 178          * Note that we copy everything but the first 64-bit word.  This
 179          * is where the PD number for the address handle resides.
 180          * By filling everything except the PD and then writing the PD in
 181          * a separate step below, we can ensure that the UDAV is not
 182          * accessed while there are partially written values in it (something
 183          * which really should not happen anyway).  This is guaranteed
 184          * because we take measures to ensure that the PD number is zero for
 185          * all unused UDAV (and because PD#0 is reserved for Tavor).
 186          */
 187         size = sizeof (tavor_hw_udav_t) >> 3;
 188         for (i = 1; i < size; i++) {
 189                 data = ((uint64_t *)&udav_entry)[i];
 190                 ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i),
 191                     data);
 192         }
 193         data = ((uint64_t *)&udav_entry)[0];
 194         ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, data);
 195 
 196         /*
 197          * Fill in the rest of the Tavor Address Handle struct.  Having
 198          * successfully copied the UDAV into the hardware, we update the
 199          * following fields for use in further operations on the AH.
 200          *
 201          * NOTE: We are saving away a copy of the "av_dgid.gid_guid" field
 202          * here because we may need to return it later to the IBTF (as a
 203          * result of a subsequent query operation).  Unlike the other UDAV
 204          * parameters, the value of "av_dgid.gid_guid" is not always preserved
 205          * by being written to hardware.  The reason for this is described in
 206          * tavor_set_addr_path().
 207          */
 208         ah->ah_udavrsrcp = udav;
 209         ah->ah_rsrcp  = rsrc;
 210         ah->ah_pdhdl  = pd;
 211         ah->ah_mrhdl  = mr;
 212         ah->ah_save_guid = attr_p->av_dgid.gid_guid;
 213         ah->ah_save_srate = attr_p->av_srate;
 214         *ahhdl = ah;
 215 
 216         /* Determine if later ddi_dma_sync will be necessary */
 217         ah->ah_sync = TAVOR_UDAV_IS_SYNC_REQ(state);
 218 
 219         /* Sync the UDAV for use by the hardware */
 220         tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
 221 
 222         TAVOR_TNF_EXIT(tavor_ah_alloc);
 223         return (DDI_SUCCESS);
 224 
 225 ahalloc_fail2:
 226         tavor_pd_refcnt_dec(pd);
 227         tavor_rsrc_free(state, &rsrc);
 228 ahalloc_fail1:
 229         tavor_rsrc_free(state, &udav);
 230 ahalloc_fail:
 231         TNF_PROBE_1(tavor_ah_alloc_fail, TAVOR_TNF_ERROR, "",
 232             tnf_string, msg, errormsg);
 233         TAVOR_TNF_EXIT(tavor_ah_alloc);
 234         return (status);
 235 }
 236 
 237 
 238 /*
 239  * tavor_ah_free()
 240  *    Context: Can be called only from user or kernel context.
 241  */
 242 /* ARGSUSED */
 243 int
 244 tavor_ah_free(tavor_state_t *state, tavor_ahhdl_t *ahhdl, uint_t sleepflag)
 245 {
 246         tavor_rsrc_t            *udav, *rsrc;
 247         tavor_pdhdl_t           pd;
 248         tavor_mrhdl_t           mr;
 249         tavor_ahhdl_t           ah;
 250         int                     status;
 251 
 252         TAVOR_TNF_ENTER(tavor_ah_free);
 253 
 254         /*
 255          * Pull all the necessary information from the Tavor Address Handle
 256          * struct.  This is necessary here because the resource for the
 257          * AH is going to be freed up as part of this operation.
 258          */
 259         ah    = *ahhdl;
 260         mutex_enter(&ah->ah_lock);
 261         udav  = ah->ah_udavrsrcp;
 262         rsrc  = ah->ah_rsrcp;
 263         pd    = ah->ah_pdhdl;
 264         mr    = ah->ah_mrhdl;
 265         mutex_exit(&ah->ah_lock);
 266 
 267         /*
 268          * Deregister the memory for the UDAV.  If this fails for any reason,
 269          * then it is an indication that something (either in HW or SW) has
 270          * gone seriously wrong.  So we print a warning message and return
 271          * failure.
 272          */
 273         status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
 274             sleepflag);
 275         if (status != DDI_SUCCESS) {
 276                 TNF_PROBE_0(tavor_ah_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
 277                 TAVOR_TNF_EXIT(tavor_ah_free);
 278                 return (ibc_get_ci_failure(0));
 279         }
 280 
 281         /*
 282          * Write zero to the first 64-bit word in the UDAV entry.  As
 283          * described above (in tavor_ah_alloc), the PD number is stored in
 284          * the first 64-bits of each UDAV and setting this to zero is
 285          * guaranteed to invalidate the entry.
 286          */
 287         ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, 0);
 288 
 289         /* Sync the UDAV for use by the hardware */
 290         tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
 291 
 292         /* Decrement the reference count on the protection domain (PD) */
 293         tavor_pd_refcnt_dec(pd);
 294 
 295         /* Free the Tavor Address Handle structure */
 296         tavor_rsrc_free(state, &rsrc);
 297 
 298         /* Free up the UDAV entry resource */
 299         tavor_rsrc_free(state, &udav);
 300 
 301         /* Set the ahhdl pointer to NULL and return success */
 302         *ahhdl = NULL;
 303 
 304         TAVOR_TNF_EXIT(tavor_ah_free);
 305         return (DDI_SUCCESS);
 306 }
 307 
 308 
 309 /*
 310  * tavor_ah_query()
 311  *    Context: Can be called from interrupt or base context.
 312  */
 313 /* ARGSUSED */
 314 int
 315 tavor_ah_query(tavor_state_t *state, tavor_ahhdl_t ah, tavor_pdhdl_t *pd,
 316     ibt_adds_vect_t *attr_p)
 317 {
 318         tavor_hw_udav_t         udav_entry;
 319         tavor_rsrc_t            *udav;
 320         uint64_t                data;
 321         uint32_t                size;
 322         int                     i;
 323 
 324         TAVOR_TNF_ENTER(tavor_ah_query);
 325 
 326         mutex_enter(&ah->ah_lock);
 327 
 328         /*
 329          * Pull all the necessary information from the Tavor Address Handle
 330          * structure
 331          */
 332         udav    = ah->ah_udavrsrcp;
 333         *pd     = ah->ah_pdhdl;
 334 
 335         /*
 336          * Copy the UDAV entry into the temporary copy.  Here we copy all
 337          * the information from the UDAV entry in DDR memory into the
 338          * temporary UDAV.  Note:  We don't need to sync the UDAV for
 339          * reading by software because Tavor HW never modifies the entry.
 340          */
 341         size = sizeof (tavor_hw_udav_t) >> 3;
 342         for (i = 0; i < size; i++) {
 343                 data = ddi_get64(udav->tr_acchdl,
 344                     ((uint64_t *)udav->tr_addr + i));
 345                 ((uint64_t *)&udav_entry)[i] = data;
 346         }
 347 
 348         /*
 349          * Fill in "ibt_adds_vect_t".  We call tavor_get_addr_path() to fill
 350          * the common portions that can be pulled from the UDAV we pass in.
 351          *
 352          * NOTE: We will also fill the "av_dgid.gid_guid" field from the
 353          * "ah_save_guid" field we have previously saved away.  The reason
 354          * for this is described in tavor_ah_alloc() and tavor_ah_modify().
 355          */
 356         tavor_get_addr_path(state, (tavor_hw_addr_path_t *)&udav_entry,
 357             attr_p, TAVOR_ADDRPATH_UDAV, NULL);
 358 
 359         attr_p->av_dgid.gid_guid = ah->ah_save_guid;
 360         attr_p->av_srate = ah->ah_save_srate;
 361 
 362         mutex_exit(&ah->ah_lock);
 363         TAVOR_TNF_EXIT(tavor_ah_query);
 364         return (DDI_SUCCESS);
 365 }
 366 
 367 
 368 /*
 369  * tavor_ah_modify()
 370  *    Context: Can be called from interrupt or base context.
 371  */
 372 /* ARGSUSED */
 373 int
 374 tavor_ah_modify(tavor_state_t *state, tavor_ahhdl_t ah,
 375     ibt_adds_vect_t *attr_p)
 376 {
 377         tavor_hw_udav_t         udav_entry;
 378         tavor_rsrc_t            *udav;
 379         uint64_t                data_new, data_old;
 380         uint32_t                udav_pd, size, portnum_new;
 381         int                     i, status;
 382 
 383         TAVOR_TNF_ENTER(tavor_ah_modify);
 384 
 385         /* Validate that specified port number is legal */
 386         if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) {
 387                 TNF_PROBE_1(tavor_ah_modify_inv_portnum,
 388                     TAVOR_TNF_ERROR, "", tnf_uint, port, attr_p->av_port_num);
 389                 TAVOR_TNF_EXIT(tavor_ah_modify);
 390                 return (IBT_HCA_PORT_INVALID);
 391         }
 392 
 393         mutex_enter(&ah->ah_lock);
 394 
 395         /*
 396          * Pull all the necessary information from the Tavor Address Handle
 397          * structure
 398          */
 399         udav = ah->ah_udavrsrcp;
 400 
 401         /*
 402          * Fill in the UDAV entry.  Note: we are only filling in a temporary
 403          * copy here, which we will later copy into the actual entry in
 404          * Tavor DDR memory.  This starts be zeroing out the temporary copy
 405          * and then calling tavor_set_addr_path() to fill in the common
 406          * portions that can be pulled from the "ibt_adds_vect_t" passed in
 407          *
 408          * NOTE: We also need to save away a copy of the "av_dgid.gid_guid"
 409          * field here (just as we did during tavor_ah_alloc()) because we
 410          * may need to return it later to the IBTF (as a result of a
 411          * subsequent query operation).  As explained in tavor_ah_alloc(),
 412          * unlike the other UDAV parameters, the value of "av_dgid.gid_guid"
 413          * is not always preserved by being written to hardware.  The reason
 414          * for this is described in tavor_set_addr_path().
 415          */
 416         bzero(&udav_entry, sizeof (tavor_hw_udav_t));
 417         status = tavor_set_addr_path(state, attr_p,
 418             (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL);
 419         if (status != DDI_SUCCESS) {
 420                 mutex_exit(&ah->ah_lock);
 421                 TNF_PROBE_0(tavor_ah_modify_setaddrpath_fail,
 422                     TAVOR_TNF_ERROR, "");
 423                 TAVOR_TNF_EXIT(tavor_ah_modify);
 424                 return (status);
 425         }
 426         ah->ah_save_guid = attr_p->av_dgid.gid_guid;
 427         ah->ah_save_srate = attr_p->av_srate;
 428 
 429         /*
 430          * Save away the current PD number for this UDAV.  Then temporarily
 431          * invalidate the entry (by setting the PD to zero).  Note:  Since
 432          * the first 32 bits of the UDAV actually contain the current port
 433          * number _and_ current PD number, we need to mask off some bits.
 434          */
 435         udav_pd = ddi_get32(udav->tr_acchdl, (uint32_t *)udav->tr_addr);
 436         udav_pd = udav_pd & 0xFFFFFF;
 437         ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, 0);
 438 
 439         /* Sync the UDAV for use by the hardware */
 440         tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
 441 
 442         /*
 443          * Copy UDAV structure to the entry
 444          *    Note:  We copy in 64-bit chunks.  For the first two of these
 445          *    chunks it is necessary to read the current contents of the
 446          *    UDAV, mask off the modifiable portions (maintaining any
 447          *    of the "reserved" portions), and then mask on the new data.
 448          */
 449         size = sizeof (tavor_hw_udav_t) >> 3;
 450         for (i = 0; i < size; i++) {
 451                 data_new = ((uint64_t *)&udav_entry)[i];
 452                 data_old = ddi_get64(udav->tr_acchdl,
 453                     ((uint64_t *)udav->tr_addr + i));
 454 
 455                 /*
 456                  * Apply mask to change only the relevant values.  Note: We
 457                  * extract the new portnum from the address handle here
 458                  * because the "PD" and "portnum" fields are in the same
 459                  * 32-bit word in the UDAV.  We will use the (new) port
 460                  * number extracted here when we write the valid PD number
 461                  * in the last step below.
 462                  */
 463                 if (i == 0) {
 464                         data_old = data_old & TAVOR_UDAV_MODIFY_MASK0;
 465                         portnum_new = data_new >> 56;
 466                 } else if (i == 1) {
 467                         data_old = data_old & TAVOR_UDAV_MODIFY_MASK1;
 468                 } else {
 469                         data_old = 0;
 470                 }
 471 
 472                 /* Write the updated values to the UDAV (in DDR) */
 473                 data_new = data_old | data_new;
 474                 ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i),
 475                     data_new);
 476         }
 477 
 478         /*
 479          * Sync the body of the UDAV for use by the hardware.  After we
 480          * have updated the PD number (to make the UDAV valid), we sync
 481          * again to push the entire entry out for hardware access.
 482          */
 483         tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
 484 
 485         /*
 486          * Put the valid PD number back into UDAV entry.  Note: Because port
 487          * number and PD number are in the same word, we must mask the
 488          * new port number with the old PD number before writing it back
 489          * to the UDAV entry
 490          */
 491         udav_pd = ((portnum_new << 24) | udav_pd);
 492         ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, udav_pd);
 493 
 494         /* Sync the rest of the UDAV for use by the hardware */
 495         tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
 496 
 497         mutex_exit(&ah->ah_lock);
 498         TAVOR_TNF_EXIT(tavor_ah_modify);
 499         return (DDI_SUCCESS);
 500 }
 501 
 502 
 503 /*
 504  * tavor_udav_sync()
 505  *    Context: Can be called from interrupt or base context.
 506  */
 507 /* ARGSUSED */
 508 static void
 509 tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav, uint_t flag)
 510 {
 511         ddi_dma_handle_t        dmahdl;
 512         off_t                   offset;
 513         int                     status;
 514 
 515         TAVOR_TNF_ENTER(tavor_udav_sync);
 516 
 517         /* Determine if AH needs to be synced or not */
 518         if (ah->ah_sync == 0) {
 519                 TAVOR_TNF_EXIT(tavor_udav_sync);
 520                 return;
 521         }
 522 
 523         /* Get the DMA handle from AH handle */
 524         dmahdl = ah->ah_mrhdl->mr_bindinfo.bi_dmahdl;
 525 
 526         /* Calculate offset into address handle */
 527         offset = (off_t)0;
 528         status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_udav_t), flag);
 529         if (status != DDI_SUCCESS) {
 530                 TNF_PROBE_0(tavor_udav_sync_getnextentry_fail,
 531                     TAVOR_TNF_ERROR, "");
 532                 TAVOR_TNF_EXIT(tavor_udav_sync);
 533                 return;
 534         }
 535 
 536         TAVOR_TNF_EXIT(tavor_udav_sync);
 537 }
 538 
 539 
 540 /*
 541  * tavor_mcg_attach()
 542  *    Context: Can be called only from user or kernel context.
 543  */
 544 int
 545 tavor_mcg_attach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid,
 546     ib_lid_t lid)
 547 {
 548         tavor_rsrc_t            *rsrc;
 549         tavor_hw_mcg_t          *mcg_entry;
 550         tavor_hw_mcg_qp_list_t  *mcg_entry_qplist;
 551         tavor_mcghdl_t          mcg, newmcg;
 552         uint64_t                mgid_hash;
 553         uint32_t                end_indx;
 554         int                     status;
 555         uint_t                  qp_found;
 556         char                    *errormsg;
 557 
 558         TAVOR_TNF_ENTER(tavor_mcg_attach);
 559 
 560         /*
 561          * It is only allowed to attach MCG to UD queue pairs.  Verify
 562          * that the intended QP is of the appropriate transport type
 563          */
 564         if (qp->qp_serv_type != TAVOR_QP_UD) {
 565                 /* Set "status" and "errormsg" and goto failure */
 566                 TAVOR_TNF_FAIL(IBT_QP_SRV_TYPE_INVALID, "invalid service type");
 567                 goto mcgattach_fail;
 568         }
 569 
 570         /*
 571          * Check for invalid Multicast DLID.  Specifically, all Multicast
 572          * LIDs should be within a well defined range.  If the specified LID
 573          * is outside of that range, then return an error.
 574          */
 575         if (tavor_mlid_is_valid(lid) == 0) {
 576                 /* Set "status" and "errormsg" and goto failure */
 577                 TAVOR_TNF_FAIL(IBT_MC_MLID_INVALID, "invalid MLID");
 578                 goto mcgattach_fail;
 579         }
 580         /*
 581          * Check for invalid Multicast GID.  All Multicast GIDs should have
 582          * a well-defined pattern of bits and flags that are allowable.  If
 583          * the specified GID does not meet the criteria, then return an error.
 584          */
 585         if (tavor_mgid_is_valid(gid) == 0) {
 586                 /* Set "status" and "errormsg" and goto failure */
 587                 TAVOR_TNF_FAIL(IBT_MC_MGID_INVALID, "invalid MGID");
 588                 goto mcgattach_fail;
 589         }
 590 
 591         /*
 592          * Compute the MGID hash value.  Since the MCG table is arranged as
 593          * a number of separate hash chains, this operation converts the
 594          * specified MGID into the starting index of an entry in the hash
 595          * table (i.e. the index for the start of the appropriate hash chain).
 596          * Subsequent operations below will walk the chain searching for the
 597          * right place to add this new QP.
 598          */
 599         status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
 600             &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT());
 601         if (status != TAVOR_CMD_SUCCESS) {
 602                 cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n",
 603                     status);
 604                 TNF_PROBE_1(tavor_mcg_attach_mgid_hash_cmd_fail,
 605                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
 606                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 607                 return (ibc_get_ci_failure(0));
 608         }
 609 
 610         /*
 611          * Grab the multicast group mutex.  Then grab the pre-allocated
 612          * temporary buffer used for holding and/or modifying MCG entries.
 613          * Zero out the temporary MCG entry before we begin.
 614          */
 615         mutex_enter(&state->ts_mcglock);
 616         mcg_entry = state->ts_mcgtmp;
 617         mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry);
 618         bzero(mcg_entry, TAVOR_MCGMEM_SZ(state));
 619 
 620         /*
 621          * Walk through the array of MCG entries starting at "mgid_hash".
 622          * Try to find the appropriate place for this new QP to be added.
 623          * This could happen when the first entry of the chain has MGID == 0
 624          * (which means that the hash chain is empty), or because we find
 625          * an entry with the same MGID (in which case we'll add the QP to
 626          * that MCG), or because we come to the end of the chain (in which
 627          * case this is the first QP being added to the multicast group that
 628          * corresponds to the MGID.  The tavor_mcg_walk_mgid_hash() routine
 629          * walks the list and returns an index into the MCG table.  The entry
 630          * at this index is then checked to determine which case we have
 631          * fallen into (see below).  Note:  We are using the "shadow" MCG
 632          * list (of tavor_mcg_t structs) for this lookup because the real
 633          * MCG entries are in hardware (and the lookup process would be much
 634          * more time consuming).
 635          */
 636         end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, NULL);
 637         mcg      = &state->ts_mcghdl[end_indx];
 638 
 639         /*
 640          * If MGID == 0, then the hash chain is empty.  Just fill in the
 641          * current entry.  Note:  No need to allocate an MCG table entry
 642          * as all the hash chain "heads" are already preallocated.
 643          */
 644         if ((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) {
 645 
 646                 /* Fill in the current entry in the "shadow" MCG list */
 647                 tavor_mcg_setup_new_hdr(mcg, mcg_entry, gid, NULL);
 648 
 649                 /*
 650                  * Try to add the new QP number to the list.  This (and the
 651                  * above) routine fills in a temporary MCG.  The "mcg_entry"
 652                  * and "mcg_entry_qplist" pointers simply point to different
 653                  * offsets within the same temporary copy of the MCG (for
 654                  * convenience).  Note:  If this fails, we need to invalidate
 655                  * the entries we've already put into the "shadow" list entry
 656                  * above.
 657                  */
 658                 status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
 659                     &qp_found);
 660                 if (status != DDI_SUCCESS) {
 661                         bzero(mcg, sizeof (struct tavor_sw_mcg_list_s));
 662                         mutex_exit(&state->ts_mcglock);
 663                         /* Set "status" and "errormsg" and goto failure */
 664                         TAVOR_TNF_FAIL(status, "failed qplist add");
 665                         goto mcgattach_fail;
 666                 }
 667 
 668                 /*
 669                  * Once the temporary MCG has been filled in, write the entry
 670                  * into the appropriate location in the Tavor MCG entry table.
 671                  * If it's successful, then drop the lock and return success.
 672                  * Note: In general, this operation shouldn't fail.  If it
 673                  * does, then it is an indication that something (probably in
 674                  * HW, but maybe in SW) has gone seriously wrong.  We still
 675                  * want to zero out the entries that we've filled in above
 676                  * (in the tavor_mcg_setup_new_hdr() routine).
 677                  */
 678                 status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
 679                     TAVOR_CMD_NOSLEEP_SPIN);
 680                 if (status != TAVOR_CMD_SUCCESS) {
 681                         bzero(mcg, sizeof (struct tavor_sw_mcg_list_s));
 682                         mutex_exit(&state->ts_mcglock);
 683                         TAVOR_WARNING(state, "failed to write MCG entry");
 684                         cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
 685                             "%08x\n", status);
 686                         TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
 687                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 688                             tnf_uint, indx, end_indx);
 689                         TAVOR_TNF_EXIT(tavor_mcg_attach);
 690                         return (ibc_get_ci_failure(0));
 691                 }
 692 
 693                 /*
 694                  * Now that we know all the Tavor firmware accesses have been
 695                  * successful, we update the "shadow" MCG entry by incrementing
 696                  * the "number of attached QPs" count.
 697                  *
 698                  * We increment only if the QP is not already part of the
 699                  * MCG by checking the 'qp_found' flag returned from the
 700                  * qplist_add above.
 701                  */
 702                 if (!qp_found) {
 703                         mcg->mcg_num_qps++;
 704 
 705                         /*
 706                          * Increment the refcnt for this QP.  Because the QP
 707                          * was added to this MCG, the refcnt must be
 708                          * incremented.
 709                          */
 710                         tavor_qp_mcg_refcnt_inc(qp);
 711                 }
 712 
 713                 /*
 714                  * We drop the lock and return success.
 715                  */
 716                 mutex_exit(&state->ts_mcglock);
 717                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 718                 return (DDI_SUCCESS);
 719         }
 720 
 721         /*
 722          * If the specified MGID matches the MGID in the current entry, then
 723          * we need to try to add the QP to the current MCG entry.  In this
 724          * case, it means that we need to read the existing MCG entry (into
 725          * the temporary MCG), add the new QP number to the temporary entry
 726          * (using the same method we used above), and write the entry back
 727          * to the hardware (same as above).
 728          */
 729         if ((mcg->mcg_mgid_h == gid.gid_prefix) &&
 730             (mcg->mcg_mgid_l == gid.gid_guid)) {
 731 
 732                 /*
 733                  * Read the current MCG entry into the temporary MCG.  Note:
 734                  * In general, this operation shouldn't fail.  If it does,
 735                  * then it is an indication that something (probably in HW,
 736                  * but maybe in SW) has gone seriously wrong.
 737                  */
 738                 status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
 739                     TAVOR_CMD_NOSLEEP_SPIN);
 740                 if (status != TAVOR_CMD_SUCCESS) {
 741                         mutex_exit(&state->ts_mcglock);
 742                         TAVOR_WARNING(state, "failed to read MCG entry");
 743                         cmn_err(CE_CONT, "Tavor: READ_MGM command failed: "
 744                             "%08x\n", status);
 745                         TNF_PROBE_2(tavor_mcg_attach_read_mgm_cmd_fail,
 746                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 747                             tnf_uint, indx, end_indx);
 748                         TAVOR_TNF_EXIT(tavor_mcg_attach);
 749                         return (ibc_get_ci_failure(0));
 750                 }
 751 
 752                 /*
 753                  * Try to add the new QP number to the list.  This routine
 754                  * fills in the necessary pieces of the temporary MCG.  The
 755                  * "mcg_entry_qplist" pointer is used to point to the portion
 756                  * of the temporary MCG that holds the QP numbers.
 757                  *
 758                  * Note: tavor_mcg_qplist_add() returns SUCCESS if it
 759                  * already found the QP in the list.  In this case, the QP is
 760                  * not added on to the list again.  Check the flag 'qp_found'
 761                  * if this value is needed to be known.
 762                  *
 763                  */
 764                 status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
 765                     &qp_found);
 766                 if (status != DDI_SUCCESS) {
 767                         mutex_exit(&state->ts_mcglock);
 768                         /* Set "status" and "errormsg" and goto failure */
 769                         TAVOR_TNF_FAIL(status, "failed qplist add");
 770                         goto mcgattach_fail;
 771                 }
 772 
 773                 /*
 774                  * Once the temporary MCG has been updated, write the entry
 775                  * into the appropriate location in the Tavor MCG entry table.
 776                  * If it's successful, then drop the lock and return success.
 777                  * Note: In general, this operation shouldn't fail.  If it
 778                  * does, then it is an indication that something (probably in
 779                  * HW, but maybe in SW) has gone seriously wrong.
 780                  */
 781                 status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
 782                     TAVOR_CMD_NOSLEEP_SPIN);
 783                 if (status != TAVOR_CMD_SUCCESS) {
 784                         mutex_exit(&state->ts_mcglock);
 785                         TAVOR_WARNING(state, "failed to write MCG entry");
 786                         cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
 787                             "%08x\n", status);
 788                         TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
 789                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 790                             tnf_uint, indx, end_indx);
 791                         TAVOR_TNF_EXIT(tavor_mcg_attach);
 792                         return (ibc_get_ci_failure(0));
 793                 }
 794 
 795                 /*
 796                  * Now that we know all the Tavor firmware accesses have been
 797                  * successful, we update the current "shadow" MCG entry by
 798                  * incrementing the "number of attached QPs" count.
 799                  *
 800                  * We increment only if the QP is not already part of the
 801                  * MCG by checking the 'qp_found' flag returned from the
 802                  * qplist_add above.
 803                  */
 804                 if (!qp_found) {
 805                         mcg->mcg_num_qps++;
 806 
 807                         /*
 808                          * Increment the refcnt for this QP.  Because the QP
 809                          * was added to this MCG, the refcnt must be
 810                          * incremented.
 811                          */
 812                         tavor_qp_mcg_refcnt_inc(qp);
 813                 }
 814 
 815                 /*
 816                  * We drop the lock and return success.
 817                  */
 818                 mutex_exit(&state->ts_mcglock);
 819                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 820                 return (DDI_SUCCESS);
 821         }
 822 
 823         /*
 824          * If we've reached here, then we're at the end of the hash chain.
 825          * We need to allocate a new MCG entry, fill it in, write it to Tavor,
 826          * and update the previous entry to link the new one to the end of the
 827          * chain.
 828          */
 829 
 830         /*
 831          * Allocate an MCG table entry.  This will be filled in with all
 832          * the necessary parameters to define the multicast group.  Then it
 833          * will be written to the hardware in the next-to-last step below.
 834          */
 835         status = tavor_rsrc_alloc(state, TAVOR_MCG, 1, TAVOR_NOSLEEP, &rsrc);
 836         if (status != DDI_SUCCESS) {
 837                 mutex_exit(&state->ts_mcglock);
 838                 /* Set "status" and "errormsg" and goto failure */
 839                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MCG");
 840                 goto mcgattach_fail;
 841         }
 842 
 843         /*
 844          * Fill in the new entry in the "shadow" MCG list.  Note:  Just as
 845          * it does above, tavor_mcg_setup_new_hdr() also fills in a portion
 846          * of the temporary MCG entry (the rest of which will be filled in by
 847          * tavor_mcg_qplist_add() below)
 848          */
 849         newmcg = &state->ts_mcghdl[rsrc->tr_indx];
 850         tavor_mcg_setup_new_hdr(newmcg, mcg_entry, gid, rsrc);
 851 
 852         /*
 853          * Try to add the new QP number to the list.  This routine fills in
 854          * the final necessary pieces of the temporary MCG.  The
 855          * "mcg_entry_qplist" pointer is used to point to the portion of the
 856          * temporary MCG that holds the QP numbers.  If we fail here, we
 857          * must undo the previous resource allocation.
 858          *
         * Note: tavor_mcg_qplist_add() returns SUCCESS if it already
 860          * found the QP in the list.  In this case, the QP is not added on to
 861          * the list again.  Check the flag 'qp_found' if this value is needed
 862          * to be known.
 863          */
 864         status = tavor_mcg_qplist_add(state, newmcg, mcg_entry_qplist, qp,
 865             &qp_found);
 866         if (status != DDI_SUCCESS) {
 867                 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
 868                 tavor_rsrc_free(state, &rsrc);
 869                 mutex_exit(&state->ts_mcglock);
 870                 /* Set "status" and "errormsg" and goto failure */
 871                 TAVOR_TNF_FAIL(status, "failed qplist add");
 872                 goto mcgattach_fail;
 873         }
 874 
 875         /*
 876          * Once the temporary MCG has been updated, write the entry into the
 877          * appropriate location in the Tavor MCG entry table.  If this is
 878          * successful, then we need to chain the previous entry to this one.
 879          * Note: In general, this operation shouldn't fail.  If it does, then
 880          * it is an indication that something (probably in HW, but maybe in
 881          * SW) has gone seriously wrong.
 882          */
 883         status = tavor_write_mgm_cmd_post(state, mcg_entry, rsrc->tr_indx,
 884             TAVOR_CMD_NOSLEEP_SPIN);
 885         if (status != TAVOR_CMD_SUCCESS) {
 886                 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
 887                 tavor_rsrc_free(state, &rsrc);
 888                 mutex_exit(&state->ts_mcglock);
 889                 TAVOR_WARNING(state, "failed to write MCG entry");
 890                 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
 891                     status);
 892                 TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
 893                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 894                     tnf_uint, indx, rsrc->tr_indx);
 895                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 896                 return (ibc_get_ci_failure(0));
 897         }
 898 
 899         /*
 900          * Now read the current MCG entry (the one previously at the end of
 901          * hash chain) into the temporary MCG.  We are going to update its
 902          * "next_gid_indx" now and write the entry back to the MCG table.
 903          * Note:  In general, this operation shouldn't fail.  If it does, then
 904          * it is an indication that something (probably in HW, but maybe in SW)
 905          * has gone seriously wrong.  We will free up the MCG entry resource,
 906          * but we will not undo the previously written MCG entry in the HW.
 907          * This is OK, though, because the MCG entry is not currently attached
 908          * to any hash chain.
 909          */
 910         status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
 911             TAVOR_CMD_NOSLEEP_SPIN);
 912         if (status != TAVOR_CMD_SUCCESS) {
 913                 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
 914                 tavor_rsrc_free(state, &rsrc);
 915                 mutex_exit(&state->ts_mcglock);
 916                 TAVOR_WARNING(state, "failed to read MCG entry");
 917                 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
 918                     status);
 919                 TNF_PROBE_2(tavor_mcg_attach_read_mgm_cmd_fail,
 920                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 921                     tnf_uint, indx, end_indx);
 922                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 923                 return (ibc_get_ci_failure(0));
 924         }
 925 
 926         /*
 927          * Finally, we update the "next_gid_indx" field in the temporary MCG
 928          * and attempt to write the entry back into the Tavor MCG table.  If
 929          * this succeeds, then we update the "shadow" list to reflect the
 930          * change, drop the lock, and return success.  Note:  In general, this
 931          * operation shouldn't fail.  If it does, then it is an indication
 932          * that something (probably in HW, but maybe in SW) has gone seriously
 933          * wrong.  Just as we do above, we will free up the MCG entry resource,
 934          * but we will not try to undo the previously written MCG entry.  This
 935          * is OK, though, because (since we failed here to update the end of
 936          * the chain) that other entry is not currently attached to any chain.
 937          */
 938         mcg_entry->next_gid_indx = rsrc->tr_indx;
 939         status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
 940             TAVOR_CMD_NOSLEEP_SPIN);
 941         if (status != TAVOR_CMD_SUCCESS) {
 942                 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
 943                 tavor_rsrc_free(state, &rsrc);
 944                 mutex_exit(&state->ts_mcglock);
 945                 TAVOR_WARNING(state, "failed to write MCG entry");
 946                 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
 947                     status);
 948                 TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
 949                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
 950                     tnf_uint, indx, end_indx);
 951                 TAVOR_TNF_EXIT(tavor_mcg_attach);
 952                 return (ibc_get_ci_failure(0));
 953         }
 954         mcg = &state->ts_mcghdl[end_indx];
 955         mcg->mcg_next_indx = rsrc->tr_indx;
 956 
 957         /*
 958          * Now that we know all the Tavor firmware accesses have been
 959          * successful, we update the new "shadow" MCG entry by incrementing
 960          * the "number of attached QPs" count.  Then we drop the lock and
 961          * return success.
 962          */
 963         newmcg->mcg_num_qps++;
 964 
 965         /*
 966          * Increment the refcnt for this QP.  Because the QP
 967          * was added to this MCG, the refcnt must be
 968          * incremented.
 969          */
 970         tavor_qp_mcg_refcnt_inc(qp);
 971 
 972         mutex_exit(&state->ts_mcglock);
 973         TAVOR_TNF_EXIT(tavor_mcg_attach);
 974         return (DDI_SUCCESS);
 975 
 976 mcgattach_fail:
 977         TNF_PROBE_1(tavor_mcg_attach_fail, TAVOR_TNF_ERROR, "", tnf_string,
 978             msg, errormsg);
 979         TAVOR_TNF_EXIT(tavor_mcg_attach);
 980         return (status);
 981 }
 982 
 983 
 984 /*
 985  * tavor_mcg_detach()
 986  *    Context: Can be called only from user or kernel context.
 987  */
 988 int
 989 tavor_mcg_detach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid,
 990     ib_lid_t lid)
 991 {
 992         tavor_hw_mcg_t          *mcg_entry;
 993         tavor_hw_mcg_qp_list_t  *mcg_entry_qplist;
 994         tavor_mcghdl_t          mcg;
 995         uint64_t                mgid_hash;
 996         uint32_t                end_indx, prev_indx;
 997         int                     status;
 998 
 999         TAVOR_TNF_ENTER(tavor_mcg_detach);
1000 
1001         /*
1002          * Check for invalid Multicast DLID.  Specifically, all Multicast
1003          * LIDs should be within a well defined range.  If the specified LID
1004          * is outside of that range, then return an error.
1005          */
1006         if (tavor_mlid_is_valid(lid) == 0) {
1007                 TNF_PROBE_0(tavor_mcg_detach_invmlid_fail, TAVOR_TNF_ERROR, "");
1008                 TAVOR_TNF_EXIT(tavor_mcg_detach);
1009                 return (IBT_MC_MLID_INVALID);
1010         }
1011 
1012         /*
1013          * Compute the MGID hash value.  As described above, the MCG table is
1014          * arranged as a number of separate hash chains.  This operation
1015          * converts the specified MGID into the starting index of an entry in
1016          * the hash table (i.e. the index for the start of the appropriate
1017          * hash chain).  Subsequent operations below will walk the chain
1018          * searching for a matching entry from which to attempt to remove
1019          * the specified QP.
1020          */
1021         status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
1022             &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT());
1023         if (status != TAVOR_CMD_SUCCESS) {
1024                 cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n",
1025                     status);
1026                 TNF_PROBE_1(tavor_mcg_detach_mgid_hash_cmd_fail,
1027                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1028                 TAVOR_TNF_EXIT(tavor_mcg_attach);
1029                 return (ibc_get_ci_failure(0));
1030         }
1031 
1032         /*
1033          * Grab the multicast group mutex.  Then grab the pre-allocated
1034          * temporary buffer used for holding and/or modifying MCG entries.
1035          */
1036         mutex_enter(&state->ts_mcglock);
1037         mcg_entry = state->ts_mcgtmp;
1038         mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry);
1039 
1040         /*
1041          * Walk through the array of MCG entries starting at "mgid_hash".
1042          * Try to find an MCG entry with a matching MGID.  The
1043          * tavor_mcg_walk_mgid_hash() routine walks the list and returns an
1044          * index into the MCG table.  The entry at this index is checked to
1045          * determine whether it is a match or not.  If it is a match, then
1046          * we continue on to attempt to remove the QP from the MCG.  If it
1047          * is not a match (or not a valid MCG entry), then we return an error.
1048          */
1049         end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, &prev_indx);
1050         mcg      = &state->ts_mcghdl[end_indx];
1051 
1052         /*
1053          * If MGID == 0 (the hash chain is empty) or if the specified MGID
1054          * does not match the MGID in the current entry, then return
1055          * IBT_MC_MGID_INVALID (to indicate that the specified MGID is not
1056          * valid).
1057          */
1058         if (((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) ||
1059             ((mcg->mcg_mgid_h != gid.gid_prefix) ||
1060             (mcg->mcg_mgid_l != gid.gid_guid))) {
1061                 mutex_exit(&state->ts_mcglock);
1062                 TNF_PROBE_0(tavor_mcg_detach_invmgid_fail, TAVOR_TNF_ERROR, "");
1063                 TAVOR_TNF_EXIT(tavor_mcg_detach);
1064                 return (IBT_MC_MGID_INVALID);
1065         }
1066 
1067         /*
1068          * Read the current MCG entry into the temporary MCG.  Note: In
1069          * general, this operation shouldn't fail.  If it does, then it is
1070          * an indication that something (probably in HW, but maybe in SW)
1071          * has gone seriously wrong.
1072          */
1073         status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
1074             TAVOR_CMD_NOSLEEP_SPIN);
1075         if (status != TAVOR_CMD_SUCCESS) {
1076                 mutex_exit(&state->ts_mcglock);
1077                 TAVOR_WARNING(state, "failed to read MCG entry");
1078                 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
1079                     status);
1080                 TNF_PROBE_2(tavor_mcg_detach_read_mgm_cmd_fail,
1081                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1082                     tnf_uint, indx, end_indx);
1083                 TAVOR_TNF_EXIT(tavor_mcg_attach);
1084                 return (ibc_get_ci_failure(0));
1085         }
1086 
1087         /*
1088          * Search the QP number list for a match.  If a match is found, then
1089          * remove the entry from the QP list.  Otherwise, if no match is found,
1090          * return an error.
1091          */
1092         status = tavor_mcg_qplist_remove(mcg, mcg_entry_qplist, qp);
1093         if (status != DDI_SUCCESS) {
1094                 mutex_exit(&state->ts_mcglock);
1095                 TAVOR_TNF_EXIT(tavor_mcg_detach);
1096                 return (status);
1097         }
1098 
1099         /*
1100          * Decrement the MCG count for this QP.  When the 'qp_mcg'
1101          * field becomes 0, then this QP is no longer a member of any
1102          * MCG.
1103          */
1104         tavor_qp_mcg_refcnt_dec(qp);
1105 
1106         /*
1107          * If the current MCG's QP number list is about to be made empty
1108          * ("mcg_num_qps" == 1), then remove the entry itself from the hash
1109          * chain.  Otherwise, just write the updated MCG entry back to the
1110          * hardware.  In either case, once we successfully update the hardware
1111          * chain, then we decrement the "shadow" list entry's "mcg_num_qps"
1112          * count (or zero out the entire "shadow" list entry) before returning
1113          * success.  Note:  Zeroing out the "shadow" list entry is done
1114          * inside of tavor_mcg_hash_list_remove().
1115          */
1116         if (mcg->mcg_num_qps == 1) {
1117 
1118                 /* Remove an MCG entry from the hash chain */
1119                 status = tavor_mcg_hash_list_remove(state, end_indx, prev_indx,
1120                     mcg_entry);
1121                 if (status != DDI_SUCCESS) {
1122                         mutex_exit(&state->ts_mcglock);
1123                         TAVOR_TNF_EXIT(tavor_mcg_detach);
1124                         return (status);
1125                 }
1126 
1127         } else {
1128                 /*
1129                  * Write the updated MCG entry back to the Tavor MCG table.
1130                  * If this succeeds, then we update the "shadow" list to
1131                  * reflect the change (i.e. decrement the "mcg_num_qps"),
1132                  * drop the lock, and return success.  Note:  In general,
1133                  * this operation shouldn't fail.  If it does, then it is an
1134                  * indication that something (probably in HW, but maybe in SW)
1135                  * has gone seriously wrong.
1136                  */
1137                 status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
1138                     TAVOR_CMD_NOSLEEP_SPIN);
1139                 if (status != TAVOR_CMD_SUCCESS) {
1140                         mutex_exit(&state->ts_mcglock);
1141                         TAVOR_WARNING(state, "failed to write MCG entry");
1142                         cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
1143                             "%08x\n", status);
1144                         TNF_PROBE_2(tavor_mcg_detach_write_mgm_cmd_fail,
1145                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1146                             tnf_uint, indx, end_indx);
1147                         TAVOR_TNF_EXIT(tavor_mcg_detach);
1148                         return (ibc_get_ci_failure(0));
1149                 }
1150                 mcg->mcg_num_qps--;
1151         }
1152 
1153         mutex_exit(&state->ts_mcglock);
1154         TAVOR_TNF_EXIT(tavor_mcg_detach);
1155         return (DDI_SUCCESS);
1156 }
1157 
1158 /*
1159  * tavor_qp_mcg_refcnt_inc()
1160  *    Context: Can be called from interrupt or base context.
1161  */
1162 static void
1163 tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp)
1164 {
1165         /* Increment the QP's MCG reference count */
1166         mutex_enter(&qp->qp_lock);
1167         qp->qp_mcg_refcnt++;
1168         TNF_PROBE_1_DEBUG(tavor_qp_mcg_refcnt_inc, TAVOR_TNF_TRACE, "",
1169             tnf_uint, refcnt, qp->qp_mcg_refcnt);
1170         mutex_exit(&qp->qp_lock);
1171 }
1172 
1173 
1174 /*
1175  * tavor_qp_mcg_refcnt_dec()
1176  *    Context: Can be called from interrupt or base context.
1177  */
1178 static void
1179 tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp)
1180 {
1181         /* Decrement the QP's MCG reference count */
1182         mutex_enter(&qp->qp_lock);
1183         qp->qp_mcg_refcnt--;
1184         TNF_PROBE_1_DEBUG(tavor_qp_mcg_refcnt_dec, TAVOR_TNF_TRACE, "",
1185             tnf_uint, refcnt, qp->qp_mcg_refcnt);
1186         mutex_exit(&qp->qp_lock);
1187 }
1188 
1189 
1190 /*
1191  * tavor_mcg_qplist_add()
1192  *    Context: Can be called from interrupt or base context.
1193  */
1194 static int
1195 tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg,
1196     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp,
1197     uint_t *qp_found)
1198 {
1199         uint_t          qplist_indx;
1200 
1201         TAVOR_TNF_ENTER(tavor_mcg_qplist_add);
1202 
1203         ASSERT(MUTEX_HELD(&state->ts_mcglock));
1204 
1205         qplist_indx = mcg->mcg_num_qps;
1206 
1207         /*
1208          * Determine if we have exceeded the maximum number of QP per
1209          * multicast group.  If we have, then return an error
1210          */
1211         if (qplist_indx >= state->ts_cfg_profile->cp_num_qp_per_mcg) {
1212                 TNF_PROBE_0(tavor_mcg_qplist_add_too_many_qps,
1213                     TAVOR_TNF_ERROR, "");
1214                 TAVOR_TNF_EXIT(tavor_mcg_qplist_add);
1215                 return (IBT_HCA_MCG_QP_EXCEEDED);
1216         }
1217 
1218         /*
1219          * Determine if the QP is already attached to this MCG table.  If it
1220          * is, then we break out and treat this operation as a NO-OP
1221          */
1222         for (qplist_indx = 0; qplist_indx < mcg->mcg_num_qps;
1223             qplist_indx++) {
1224                 if (mcg_qplist[qplist_indx].qpn == qp->qp_qpnum) {
1225                         break;
1226                 }
1227         }
1228 
1229         /*
1230          * If the QP was already on the list, set 'qp_found' to TRUE.  We still
1231          * return SUCCESS in this case, but the qplist will not have been
1232          * updated because the QP was already on the list.
1233          */
1234         if (qplist_indx < mcg->mcg_num_qps) {
1235                 *qp_found = 1;
1236         } else {
1237                 /*
1238                  * Otherwise, append the new QP number to the end of the
1239                  * current QP list.  Note: We will increment the "mcg_num_qps"
1240                  * field on the "shadow" MCG list entry later (after we know
1241                  * that all necessary Tavor firmware accesses have been
1242                  * successful).
1243                  *
1244                  * Set 'qp_found' to 0 so we know the QP was added on to the
1245                  * list for sure.
1246                  */
1247                 mcg_qplist[qplist_indx].q   = TAVOR_MCG_QPN_VALID;
1248                 mcg_qplist[qplist_indx].qpn = qp->qp_qpnum;
1249                 *qp_found = 0;
1250         }
1251 
1252         TAVOR_TNF_EXIT(tavor_mcg_qplist_add);
1253         return (DDI_SUCCESS);
1254 }
1255 
1256 
1257 
1258 /*
1259  * tavor_mcg_qplist_remove()
1260  *    Context: Can be called from interrupt or base context.
1261  */
1262 static int
1263 tavor_mcg_qplist_remove(tavor_mcghdl_t mcg, tavor_hw_mcg_qp_list_t *mcg_qplist,
1264     tavor_qphdl_t qp)
1265 {
1266         uint_t          i, qplist_indx;
1267 
1268         TAVOR_TNF_ENTER(tavor_mcg_qplist_remove);
1269 
1270         /*
1271          * Search the MCG QP list for a matching QPN.  When
1272          * it's found, we swap the last entry with the current
1273          * one, set the last entry to zero, decrement the last
1274          * entry, and return.  If it's not found, then it's
1275          * and error.
1276          */
1277         qplist_indx = mcg->mcg_num_qps;
1278         for (i = 0; i < qplist_indx; i++) {
1279                 if (mcg_qplist[i].qpn == qp->qp_qpnum) {
1280                         mcg_qplist[i] = mcg_qplist[qplist_indx - 1];
1281                         mcg_qplist[qplist_indx - 1].q = TAVOR_MCG_QPN_INVALID;
1282                         mcg_qplist[qplist_indx - 1].qpn = 0;
1283 
1284                         TAVOR_TNF_EXIT(tavor_mcg_qplist_remove);
1285                         return (DDI_SUCCESS);
1286                 }
1287         }
1288 
1289         TNF_PROBE_0(tavor_mcg_qplist_remove_invqphdl_fail, TAVOR_TNF_ERROR, "");
1290         TAVOR_TNF_EXIT(tavor_mcg_qplist_remove);
1291         return (IBT_QP_HDL_INVALID);
1292 }
1293 
1294 
1295 /*
1296  * tavor_mcg_walk_mgid_hash()
1297  *    Context: Can be called from interrupt or base context.
1298  */
1299 static uint_t
1300 tavor_mcg_walk_mgid_hash(tavor_state_t *state, uint64_t start_indx,
1301     ib_gid_t mgid, uint_t *p_indx)
1302 {
1303         tavor_mcghdl_t  curr_mcghdl;
1304         uint_t          curr_indx, prev_indx;
1305 
1306         TAVOR_TNF_ENTER(tavor_mcg_walk_mgid_hash);
1307 
1308         ASSERT(MUTEX_HELD(&state->ts_mcglock));
1309 
1310         /* Start at the head of the hash chain */
1311         curr_indx   = start_indx;
1312         prev_indx   = curr_indx;
1313         curr_mcghdl = &state->ts_mcghdl[curr_indx];
1314 
1315         /* If the first entry in the chain has MGID == 0, then stop */
1316         if ((curr_mcghdl->mcg_mgid_h == 0) &&
1317             (curr_mcghdl->mcg_mgid_l == 0)) {
1318                 goto end_mgid_hash_walk;
1319         }
1320 
1321         /* If the first entry in the chain matches the MGID, then stop */
1322         if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1323             (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1324                 goto end_mgid_hash_walk;
1325         }
1326 
1327         /* Otherwise, walk the hash chain looking for a match */
1328         while (curr_mcghdl->mcg_next_indx != 0) {
1329                 prev_indx = curr_indx;
1330                 curr_indx = curr_mcghdl->mcg_next_indx;
1331                 curr_mcghdl = &state->ts_mcghdl[curr_indx];
1332 
1333                 if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1334                     (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1335                         break;
1336                 }
1337         }
1338 
1339 end_mgid_hash_walk:
1340         /*
1341          * If necessary, return the index of the previous entry too.  This
1342          * is primarily used for detaching a QP from a multicast group.  It
1343          * may be necessary, in that case, to delete an MCG entry from the
1344          * hash chain and having the index of the previous entry is helpful.
1345          */
1346         if (p_indx != NULL) {
1347                 *p_indx = prev_indx;
1348         }
1349         TAVOR_TNF_EXIT(tavor_mcg_walk_mgid_hash);
1350         return (curr_indx);
1351 }
1352 
1353 
1354 /*
1355  * tavor_mcg_setup_new_hdr()
1356  *    Context: Can be called from interrupt or base context.
1357  */
1358 static void
1359 tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg, tavor_hw_mcg_t *mcg_hdr,
1360     ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc)
1361 {
1362         TAVOR_TNF_ENTER(tavor_mcg_setup_new_hdr);
1363 
1364         /*
1365          * Fill in the fields of the "shadow" entry used by software
1366          * to track MCG hardware entry
1367          */
1368         mcg->mcg_mgid_h         = mgid.gid_prefix;
1369         mcg->mcg_mgid_l         = mgid.gid_guid;
1370         mcg->mcg_rsrcp          = mcg_rsrc;
1371         mcg->mcg_next_indx = 0;
1372         mcg->mcg_num_qps   = 0;
1373 
1374         /*
1375          * Fill the header fields of the MCG entry (in the temporary copy)
1376          */
1377         mcg_hdr->mgid_h              = mgid.gid_prefix;
1378         mcg_hdr->mgid_l              = mgid.gid_guid;
1379         mcg_hdr->next_gid_indx       = 0;
1380 
1381         TAVOR_TNF_EXIT(tavor_mcg_setup_new_hdr);
1382 }
1383 
1384 
1385 /*
1386  * tavor_mcg_hash_list_remove()
1387  *    Context: Can be called only from user or kernel context.
1388  */
1389 static int
1390 tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx,
1391     uint_t prev_indx, tavor_hw_mcg_t *mcg_entry)
1392 {
1393         tavor_mcghdl_t          curr_mcg, prev_mcg, next_mcg;
1394         uint_t                  next_indx;
1395         int                     status;
1396 
1397         /* Get the pointer to "shadow" list for current entry */
1398         curr_mcg = &state->ts_mcghdl[curr_indx];
1399 
1400         /*
1401          * If this is the first entry on a hash chain, then attempt to replace
1402          * the entry with the next entry on the chain.  If there are no
1403          * subsequent entries on the chain, then this is the only entry and
1404          * should be invalidated.
1405          */
1406         if (curr_indx == prev_indx) {
1407 
1408                 /*
1409                  * If this is the only entry on the chain, then invalidate it.
1410                  * Note:  Invalidating an MCG entry means writing all zeros
1411                  * to the entry.  This is only necessary for those MCG
1412                  * entries that are the "head" entries of the individual hash
1413                  * chains.  Regardless of whether this operation returns
1414                  * success or failure, return that result to the caller.
1415                  */
1416                 next_indx = curr_mcg->mcg_next_indx;
1417                 if (next_indx == 0) {
1418                         status = tavor_mcg_entry_invalidate(state, mcg_entry,
1419                             curr_indx);
1420                         bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1421                         TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1422                         return (status);
1423                 }
1424 
1425                 /*
1426                  * Otherwise, this is just the first entry on the chain, so
1427                  * grab the next one
1428                  */
1429                 next_mcg = &state->ts_mcghdl[next_indx];
1430 
1431                 /*
1432                  * Read the next MCG entry into the temporary MCG.  Note:
1433                  * In general, this operation shouldn't fail.  If it does,
1434                  * then it is an indication that something (probably in HW,
1435                  * but maybe in SW) has gone seriously wrong.
1436                  */
1437                 status = tavor_read_mgm_cmd_post(state, mcg_entry, next_indx,
1438                     TAVOR_CMD_NOSLEEP_SPIN);
1439                 if (status != TAVOR_CMD_SUCCESS) {
1440                         TAVOR_WARNING(state, "failed to read MCG entry");
1441                         cmn_err(CE_CONT, "Tavor: READ_MGM command failed: "
1442                             "%08x\n", status);
1443                         TNF_PROBE_2(tavor_mcg_hash_list_rem_read_mgm_cmd_fail,
1444                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1445                             tnf_uint, indx, next_indx);
1446                         TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1447                         return (ibc_get_ci_failure(0));
1448                 }
1449 
1450                 /*
1451                  * Copy/Write the temporary MCG back to the hardware MCG list
1452                  * using the current index.  This essentially removes the
1453                  * current MCG entry from the list by writing over it with
1454                  * the next one.  If this is successful, then we can do the
1455                  * same operation for the "shadow" list.  And we can also
1456                  * free up the Tavor MCG entry resource that was associated
1457                  * with the (old) next entry.  Note:  In general, this
1458                  * operation shouldn't fail.  If it does, then it is an
1459                  * indication that something (probably in HW, but maybe in SW)
1460                  * has gone seriously wrong.
1461                  */
1462                 status = tavor_write_mgm_cmd_post(state, mcg_entry, curr_indx,
1463                     TAVOR_CMD_NOSLEEP_SPIN);
1464                 if (status != TAVOR_CMD_SUCCESS) {
1465                         TAVOR_WARNING(state, "failed to write MCG entry");
1466                         cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
1467                             "%08x\n", status);
1468                         TNF_PROBE_2(tavor_mcg_hash_list_rem_write_mgm_cmd_fail,
1469                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1470                             tnf_uint, indx, curr_indx);
1471                         TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1472                         return (ibc_get_ci_failure(0));
1473                 }
1474 
1475                 /*
1476                  * Copy all the software tracking information from the next
1477                  * entry on the "shadow" MCG list into the current entry on
1478                  * the list.  Then invalidate (zero out) the other "shadow"
1479                  * list entry.
1480                  */
1481                 bcopy(next_mcg, curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1482                 bzero(next_mcg, sizeof (struct tavor_sw_mcg_list_s));
1483 
1484                 /*
1485                  * Free up the Tavor MCG entry resource used by the "next"
1486                  * MCG entry.  That resource is no longer needed by any
1487                  * MCG entry which is first on a hash chain (like the "next"
1488                  * entry has just become).
1489                  */
1490                 tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1491 
1492                 TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1493                 return (DDI_SUCCESS);
1494         }
1495 
1496         /*
1497          * Else if this is the last entry on the hash chain (or a middle
1498          * entry, then we update the previous entry's "next_gid_index" field
1499          * to make it point instead to the next entry on the chain.  By
1500          * skipping over the removed entry in this way, we can then free up
1501          * any resources associated with the current entry.  Note:  We don't
1502          * need to invalidate the "skipped over" hardware entry because it
1503          * will no be longer connected to any hash chains, and if/when it is
1504          * finally re-used, it will be written with entirely new values.
1505          */
1506 
1507         /*
1508          * Read the next MCG entry into the temporary MCG.  Note:  In general,
1509          * this operation shouldn't fail.  If it does, then it is an
1510          * indication that something (probably in HW, but maybe in SW) has
1511          * gone seriously wrong.
1512          */
1513         status = tavor_read_mgm_cmd_post(state, mcg_entry, prev_indx,
1514             TAVOR_CMD_NOSLEEP_SPIN);
1515         if (status != TAVOR_CMD_SUCCESS) {
1516                 TAVOR_WARNING(state, "failed to read MCG entry");
1517                 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
1518                     status);
1519                 TNF_PROBE_2(tavor_mcg_hash_list_rem_read_mgm_cmd_fail,
1520                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1521                     tnf_uint, indx, prev_indx);
1522                 TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1523                 return (ibc_get_ci_failure(0));
1524         }
1525 
1526         /*
1527          * Finally, we update the "next_gid_indx" field in the temporary MCG
1528          * and attempt to write the entry back into the Tavor MCG table.  If
1529          * this succeeds, then we update the "shadow" list to reflect the
1530          * change, free up the Tavor MCG entry resource that was associated
1531          * with the current entry, and return success.  Note:  In general,
1532          * this operation shouldn't fail.  If it does, then it is an indication
1533          * that something (probably in HW, but maybe in SW) has gone seriously
1534          * wrong.
1535          */
1536         mcg_entry->next_gid_indx = curr_mcg->mcg_next_indx;
1537         status = tavor_write_mgm_cmd_post(state, mcg_entry, prev_indx,
1538             TAVOR_CMD_NOSLEEP_SPIN);
1539         if (status != TAVOR_CMD_SUCCESS) {
1540                 TAVOR_WARNING(state, "failed to write MCG entry");
1541                 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
1542                     status);
1543                 TNF_PROBE_2(tavor_mcg_hash_list_rem_write_mgm_cmd_fail,
1544                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1545                     tnf_uint, indx, prev_indx);
1546                 TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1547                 return (ibc_get_ci_failure(0));
1548         }
1549 
1550         /*
1551          * Get the pointer to the "shadow" MCG list entry for the previous
1552          * MCG.  Update its "mcg_next_indx" to point to the next entry
1553          * the one after the current entry. Note:  This next index may be
1554          * zero, indicating the end of the list.
1555          */
1556         prev_mcg = &state->ts_mcghdl[prev_indx];
1557         prev_mcg->mcg_next_indx = curr_mcg->mcg_next_indx;
1558 
1559         /*
1560          * Free up the Tavor MCG entry resource used by the current entry.
1561          * This resource is no longer needed because the chain now skips over
1562          * the current entry.  Then invalidate (zero out) the current "shadow"
1563          * list entry.
1564          */
1565         tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1566         bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1567 
1568         TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1569         return (DDI_SUCCESS);
1570 }
1571 
1572 
1573 /*
1574  * tavor_mcg_entry_invalidate()
1575  *    Context: Can be called only from user or kernel context.
1576  */
1577 static int
1578 tavor_mcg_entry_invalidate(tavor_state_t *state, tavor_hw_mcg_t *mcg_entry,
1579     uint_t indx)
1580 {
1581         int             status;
1582 
1583         TAVOR_TNF_ENTER(tavor_mcg_entry_invalidate);
1584 
1585         /*
1586          * Invalidate the hardware MCG entry by zeroing out this temporary
1587          * MCG and writing it the the hardware.  Note: In general, this
1588          * operation shouldn't fail.  If it does, then it is an indication
1589          * that something (probably in HW, but maybe in SW) has gone seriously
1590          * wrong.
1591          */
1592         bzero(mcg_entry, TAVOR_MCGMEM_SZ(state));
1593         status = tavor_write_mgm_cmd_post(state, mcg_entry, indx,
1594             TAVOR_CMD_NOSLEEP_SPIN);
1595         if (status != TAVOR_CMD_SUCCESS) {
1596                 TAVOR_WARNING(state, "failed to write MCG entry");
1597                 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
1598                     status);
1599                 TNF_PROBE_2(tavor_mcg_entry_invalidate_write_mgm_cmd_fail,
1600                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1601                     tnf_uint, indx, indx);
1602                 TAVOR_TNF_EXIT(tavor_mcg_entry_invalidate);
1603                 return (ibc_get_ci_failure(0));
1604         }
1605 
1606         TAVOR_TNF_EXIT(tavor_mcg_entry_invalidate);
1607         return (DDI_SUCCESS);
1608 }
1609 
1610 
1611 /*
1612  * tavor_mgid_is_valid()
1613  *    Context: Can be called from interrupt or base context.
1614  */
1615 static int
1616 tavor_mgid_is_valid(ib_gid_t gid)
1617 {
1618         uint_t          topbits, flags, scope;
1619 
1620         TAVOR_TNF_ENTER(tavor_mgid_is_valid);
1621 
1622         /*
1623          * According to IBA 1.1 specification (section 4.1.1) a valid
1624          * "multicast GID" must have its top eight bits set to all ones
1625          */
1626         topbits = (gid.gid_prefix >> TAVOR_MCG_TOPBITS_SHIFT) &
1627             TAVOR_MCG_TOPBITS_MASK;
1628         if (topbits != TAVOR_MCG_TOPBITS) {
1629                 TNF_PROBE_0(tavor_mgid_is_valid_invbits_fail, TAVOR_TNF_ERROR,
1630                     "");
1631                 TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1632                 return (0);
1633         }
1634 
1635         /*
1636          * The next 4 bits are the "flag" bits.  These are valid only
1637          * if they are "0" (which correspond to permanently assigned/
1638          * "well-known" multicast GIDs) or "1" (for so-called "transient"
1639          * multicast GIDs).  All other values are reserved.
1640          */
1641         flags = (gid.gid_prefix >> TAVOR_MCG_FLAGS_SHIFT) &
1642             TAVOR_MCG_FLAGS_MASK;
1643         if (!((flags == TAVOR_MCG_FLAGS_PERM) ||
1644             (flags == TAVOR_MCG_FLAGS_NONPERM))) {
1645                 TNF_PROBE_1(tavor_mgid_is_valid_invflags_fail, TAVOR_TNF_ERROR,
1646                     "", tnf_uint, flags, flags);
1647                 TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1648                 return (0);
1649         }
1650 
1651         /*
1652          * The next 4 bits are the "scope" bits.  These are valid only
1653          * if they are "2" (Link-local), "5" (Site-local), "8"
1654          * (Organization-local) or "E" (Global).  All other values
1655          * are reserved (or currently unassigned).
1656          */
1657         scope = (gid.gid_prefix >> TAVOR_MCG_SCOPE_SHIFT) &
1658             TAVOR_MCG_SCOPE_MASK;
1659         if (!((scope == TAVOR_MCG_SCOPE_LINKLOC) ||
1660             (scope == TAVOR_MCG_SCOPE_SITELOC)   ||
1661             (scope == TAVOR_MCG_SCOPE_ORGLOC)    ||
1662             (scope == TAVOR_MCG_SCOPE_GLOBAL))) {
1663                 TNF_PROBE_1(tavor_mgid_is_valid_invscope_fail, TAVOR_TNF_ERROR,
1664                     "", tnf_uint, scope, scope);
1665                 TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1666                 return (0);
1667         }
1668 
1669         /*
1670          * If it passes all of the above checks, then we will consider it
1671          * a valid multicast GID.
1672          */
1673         TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1674         return (1);
1675 }
1676 
1677 
1678 /*
1679  * tavor_mlid_is_valid()
1680  *    Context: Can be called from interrupt or base context.
1681  */
1682 static int
1683 tavor_mlid_is_valid(ib_lid_t lid)
1684 {
1685         TAVOR_TNF_ENTER(tavor_mlid_is_valid);
1686 
1687         /*
1688          * According to IBA 1.1 specification (section 4.1.1) a valid
1689          * "multicast DLID" must be between 0xC000 and 0xFFFE.
1690          */
1691         if ((lid < IB_LID_MC_FIRST) || (lid > IB_LID_MC_LAST)) {
1692                 TNF_PROBE_1(tavor_mlid_is_valid_invdlid_fail, TAVOR_TNF_ERROR,
1693                     "", tnf_uint, mlid, lid);
1694                 TAVOR_TNF_EXIT(tavor_mlid_is_valid);
1695                 return (0);
1696         }
1697 
1698         TAVOR_TNF_EXIT(tavor_mlid_is_valid);
1699         return (1);
1700 }
1701 
1702 
1703 /*
1704  * tavor_pd_alloc()
1705  *    Context: Can be called only from user or kernel context.
1706  */
1707 int
1708 tavor_pd_alloc(tavor_state_t *state, tavor_pdhdl_t *pdhdl, uint_t sleepflag)
1709 {
1710         tavor_rsrc_t    *rsrc;
1711         tavor_pdhdl_t   pd;
1712         int             status;
1713 
1714         TAVOR_TNF_ENTER(tavor_pd_alloc);
1715 
1716         /*
1717          * Allocate the software structure for tracking the protection domain
1718          * (i.e. the Tavor Protection Domain handle).  By default each PD
1719          * structure will have a unique PD number assigned to it.  All that
1720          * is necessary is for software to initialize the PD reference count
1721          * (to zero) and return success.
1722          */
1723         status = tavor_rsrc_alloc(state, TAVOR_PDHDL, 1, sleepflag, &rsrc);
1724         if (status != DDI_SUCCESS) {
1725                 TNF_PROBE_0(tavor_pd_alloc_rsrcalloc_fail, TAVOR_TNF_ERROR, "");
1726                 TAVOR_TNF_EXIT(tavor_pd_alloc);
1727                 return (IBT_INSUFF_RESOURCE);
1728         }
1729         pd = (tavor_pdhdl_t)rsrc->tr_addr;
1730 
1731         pd->pd_refcnt = 0;
1732         *pdhdl = pd;
1733 
1734         TAVOR_TNF_EXIT(tavor_pd_alloc);
1735         return (DDI_SUCCESS);
1736 }
1737 
1738 
1739 /*
1740  * tavor_pd_free()
1741  *    Context: Can be called only from user or kernel context.
1742  */
1743 int
1744 tavor_pd_free(tavor_state_t *state, tavor_pdhdl_t *pdhdl)
1745 {
1746         tavor_rsrc_t    *rsrc;
1747         tavor_pdhdl_t   pd;
1748 
1749         TAVOR_TNF_ENTER(tavor_pd_free);
1750 
1751         /*
1752          * Pull all the necessary information from the Tavor Protection Domain
1753          * handle.  This is necessary here because the resource for the
1754          * PD is going to be freed up as part of this operation.
1755          */
1756         pd   = *pdhdl;
1757         rsrc = pd->pd_rsrcp;
1758 
1759         /*
1760          * Check the PD reference count.  If the reference count is non-zero,
1761          * then it means that this protection domain is still referenced by
1762          * some memory region, queue pair, address handle, or other IB object
1763          * If it is non-zero, then return an error.  Otherwise, free the
1764          * Tavor resource and return success.
1765          */
1766         if (pd->pd_refcnt != 0) {
1767                 TNF_PROBE_1(tavor_pd_free_refcnt_fail, TAVOR_TNF_ERROR, "",
1768                     tnf_int, refcnt, pd->pd_refcnt);
1769                 TAVOR_TNF_EXIT(tavor_pd_free);
1770                 return (IBT_PD_IN_USE);
1771         }
1772 
1773         /* Free the Tavor Protection Domain handle */
1774         tavor_rsrc_free(state, &rsrc);
1775 
1776         /* Set the pdhdl pointer to NULL and return success */
1777         *pdhdl = (tavor_pdhdl_t)NULL;
1778 
1779         TAVOR_TNF_EXIT(tavor_pd_free);
1780         return (DDI_SUCCESS);
1781 }
1782 
1783 
1784 /*
1785  * tavor_pd_refcnt_inc()
1786  *    Context: Can be called from interrupt or base context.
1787  */
1788 void
1789 tavor_pd_refcnt_inc(tavor_pdhdl_t pd)
1790 {
1791         /* Increment the protection domain's reference count */
1792         mutex_enter(&pd->pd_lock);
1793         TNF_PROBE_1_DEBUG(tavor_pd_refcnt_inc, TAVOR_TNF_TRACE, "",
1794             tnf_uint, refcnt, pd->pd_refcnt);
1795         pd->pd_refcnt++;
1796         mutex_exit(&pd->pd_lock);
1797 
1798 }
1799 
1800 
1801 /*
1802  * tavor_pd_refcnt_dec()
1803  *    Context: Can be called from interrupt or base context.
1804  */
1805 void
1806 tavor_pd_refcnt_dec(tavor_pdhdl_t pd)
1807 {
1808         /* Decrement the protection domain's reference count */
1809         mutex_enter(&pd->pd_lock);
1810         pd->pd_refcnt--;
1811         TNF_PROBE_1_DEBUG(tavor_pd_refcnt_dec, TAVOR_TNF_TRACE, "",
1812             tnf_uint, refcnt, pd->pd_refcnt);
1813         mutex_exit(&pd->pd_lock);
1814 
1815 }
1816 
1817 
1818 /*
1819  * tavor_port_query()
1820  *    Context: Can be called only from user or kernel context.
1821  */
1822 int
1823 tavor_port_query(tavor_state_t *state, uint_t port, ibt_hca_portinfo_t *pi)
1824 {
1825         sm_portinfo_t           portinfo;
1826         sm_guidinfo_t           guidinfo;
1827         sm_pkey_table_t         pkeytable;
1828         ib_gid_t                *sgid;
1829         uint_t                  sgid_max, pkey_max, tbl_size;
1830         int                     i, j, indx, status;
1831 
1832         TAVOR_TNF_ENTER(tavor_port_query);
1833 
1834         /* Validate that specified port number is legal */
1835         if (!tavor_portnum_is_valid(state, port)) {
1836                 TNF_PROBE_1(tavor_port_query_inv_portnum_fail,
1837                     TAVOR_TNF_ERROR, "", tnf_uint, port, port);
1838                 TAVOR_TNF_EXIT(tavor_port_query);
1839                 return (IBT_HCA_PORT_INVALID);
1840         }
1841 
1842         /*
1843          * We use the Tavor MAD_IFC command to post a GetPortInfo MAD
1844          * to the firmware (for the specified port number).  This returns
1845          * a full PortInfo MAD (in "portinfo") which we subsequently
1846          * parse to fill in the "ibt_hca_portinfo_t" structure returned
1847          * to the IBTF.
1848          */
1849         status = tavor_getportinfo_cmd_post(state, port,
1850             TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
1851         if (status != TAVOR_CMD_SUCCESS) {
1852                 cmn_err(CE_CONT, "Tavor: GetPortInfo (port %02d) command "
1853                     "failed: %08x\n", port, status);
1854                 TNF_PROBE_1(tavor_port_query_getportinfo_cmd_fail,
1855                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1856                 TAVOR_TNF_EXIT(tavor_port_query);
1857                 return (ibc_get_ci_failure(0));
1858         }
1859 
1860         /*
1861          * Parse the PortInfo MAD and fill in the IBTF structure
1862          */
1863         pi->p_base_lid               = portinfo.LID;
1864         pi->p_qkey_violations        = portinfo.Q_KeyViolations;
1865         pi->p_pkey_violations        = portinfo.P_KeyViolations;
1866         pi->p_sm_sl          = portinfo.MasterSMSL;
1867         pi->p_sm_lid         = portinfo.MasterSMLID;
1868         pi->p_linkstate              = portinfo.PortState;
1869         pi->p_port_num               = portinfo.LocalPortNum;
1870         pi->p_phys_state     = portinfo.PortPhysicalState;
1871         pi->p_width_supported        = portinfo.LinkWidthSupported;
1872         pi->p_width_enabled  = portinfo.LinkWidthEnabled;
1873         pi->p_width_active   = portinfo.LinkWidthActive;
1874         pi->p_speed_supported        = portinfo.LinkSpeedSupported;
1875         pi->p_speed_enabled  = portinfo.LinkSpeedEnabled;
1876         pi->p_speed_active   = portinfo.LinkSpeedActive;
1877         pi->p_mtu            = portinfo.MTUCap;
1878         pi->p_lmc            = portinfo.LMC;
1879         pi->p_max_vl         = portinfo.VLCap;
1880         pi->p_subnet_timeout = portinfo.SubnetTimeOut;
1881         pi->p_msg_sz         = ((uint32_t)1 << TAVOR_QP_LOG_MAX_MSGSZ);
1882         tbl_size = state->ts_cfg_profile->cp_log_max_gidtbl;
1883         pi->p_sgid_tbl_sz    = (1 << tbl_size);
1884         tbl_size = state->ts_cfg_profile->cp_log_max_pkeytbl;
1885         pi->p_pkey_tbl_sz    = (1 << tbl_size);
1886 
1887         /*
1888          * Convert InfiniBand-defined port capability flags to the format
1889          * specified by the IBTF
1890          */
1891         if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM)
1892                 pi->p_capabilities |= IBT_PORT_CAP_SM;
1893         if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM_DISABLED)
1894                 pi->p_capabilities |= IBT_PORT_CAP_SM_DISABLED;
1895         if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SNMP_SUPPD)
1896                 pi->p_capabilities |= IBT_PORT_CAP_SNMP_TUNNEL;
1897         if (portinfo.CapabilityMask & SM_CAP_MASK_IS_DM_SUPPD)
1898                 pi->p_capabilities |= IBT_PORT_CAP_DM;
1899         if (portinfo.CapabilityMask & SM_CAP_MASK_IS_VM_SUPPD)
1900                 pi->p_capabilities |= IBT_PORT_CAP_VENDOR;
1901 
1902         /*
1903          * Fill in the SGID table.  Since the only access to the Tavor
1904          * GID tables is through the firmware's MAD_IFC interface, we
1905          * post as many GetGUIDInfo MADs as necessary to read in the entire
1906          * contents of the SGID table (for the specified port).  Note:  The
1907          * GetGUIDInfo command only gets eight GUIDs per operation.  These
1908          * GUIDs are then appended to the GID prefix for the port (from the
1909          * GetPortInfo above) to form the entire SGID table.
1910          */
1911         for (i = 0; i < pi->p_sgid_tbl_sz; i += 8) {
1912                 status = tavor_getguidinfo_cmd_post(state, port, i >> 3,
1913                     TAVOR_SLEEPFLAG_FOR_CONTEXT(), &guidinfo);
1914                 if (status != TAVOR_CMD_SUCCESS) {
1915                         cmn_err(CE_CONT, "Tavor: GetGUIDInfo (port %02d) "
1916                             "command failed: %08x\n", port, status);
1917                         TNF_PROBE_1(tavor_port_query_getguidinfo_cmd_fail,
1918                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1919                         TAVOR_TNF_EXIT(tavor_port_query);
1920                         return (ibc_get_ci_failure(0));
1921                 }
1922 
1923                 /* Figure out how many of the entries are valid */
1924                 sgid_max = min((pi->p_sgid_tbl_sz - i), 8);
1925                 for (j = 0; j < sgid_max; j++) {
1926                         indx = (i + j);
1927                         sgid = &pi->p_sgid_tbl[indx];
1928                         sgid->gid_prefix = portinfo.GidPrefix;
1929                         sgid->gid_guid        = guidinfo.GUIDBlocks[j];
1930                 }
1931         }
1932 
1933         /*
1934          * Fill in the PKey table.  Just as for the GID tables above, the
1935          * only access to the Tavor PKey tables is through the firmware's
1936          * MAD_IFC interface.  We post as many GetPKeyTable MADs as necessary
1937          * to read in the entire contents of the PKey table (for the specified
1938          * port).  Note:  The GetPKeyTable command only gets 32 PKeys per
1939          * operation.
1940          */
1941         for (i = 0; i < pi->p_pkey_tbl_sz; i += 32) {
1942                 status = tavor_getpkeytable_cmd_post(state, port, i,
1943                     TAVOR_SLEEPFLAG_FOR_CONTEXT(), &pkeytable);
1944                 if (status != TAVOR_CMD_SUCCESS) {
1945                         cmn_err(CE_CONT, "Tavor: GetPKeyTable (port %02d) "
1946                             "command failed: %08x\n", port, status);
1947                         TNF_PROBE_1(tavor_port_query_getpkeytable_cmd_fail,
1948                             TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1949                         TAVOR_TNF_EXIT(tavor_port_query);
1950                         return (ibc_get_ci_failure(0));
1951                 }
1952 
1953                 /* Figure out how many of the entries are valid */
1954                 pkey_max = min((pi->p_pkey_tbl_sz - i), 32);
1955                 for (j = 0; j < pkey_max; j++) {
1956                         indx = (i + j);
1957                         pi->p_pkey_tbl[indx] = pkeytable.P_KeyTableBlocks[j];
1958                 }
1959         }
1960 
1961         TAVOR_TNF_EXIT(tavor_port_query);
1962         return (DDI_SUCCESS);
1963 }
1964 
1965 
1966 /*
1967  * tavor_port_modify()
1968  *    Context: Can be called only from user or kernel context.
1969  */
1970 /* ARGSUSED */
1971 int
1972 tavor_port_modify(tavor_state_t *state, uint8_t port,
1973     ibt_port_modify_flags_t flags, uint8_t init_type)
1974 {
1975         sm_portinfo_t   portinfo;
1976         uint32_t        capmask, reset_qkey;
1977         int             status;
1978 
1979         TAVOR_TNF_ENTER(tavor_port_modify);
1980 
1981         /*
1982          * Return an error if either of the unsupported flags are set
1983          */
1984         if ((flags & IBT_PORT_SHUTDOWN) ||
1985             (flags & IBT_PORT_SET_INIT_TYPE)) {
1986                 TNF_PROBE_1(tavor_port_modify_inv_flags_fail,
1987                     TAVOR_TNF_ERROR, "", tnf_uint, flags, flags);
1988                 TAVOR_TNF_EXIT(tavor_port_modify);
1989                 return (IBT_NOT_SUPPORTED);
1990         }
1991 
1992         /*
1993          * Determine whether we are trying to reset the QKey counter
1994          */
1995         reset_qkey = (flags & IBT_PORT_RESET_QKEY) ? 1 : 0;
1996 
1997         /* Validate that specified port number is legal */
1998         if (!tavor_portnum_is_valid(state, port)) {
1999                 TNF_PROBE_1(tavor_port_modify_inv_portnum_fail,
2000                     TAVOR_TNF_ERROR, "", tnf_uint, port, port);
2001                 TAVOR_TNF_EXIT(tavor_port_modify);
2002                 return (IBT_HCA_PORT_INVALID);
2003         }
2004 
2005         /*
2006          * Use the Tavor MAD_IFC command to post a GetPortInfo MAD to the
2007          * firmware (for the specified port number).  This returns a full
2008          * PortInfo MAD (in "portinfo") from which we pull the current
2009          * capability mask.  We then modify the capability mask as directed
2010          * by the "pmod_flags" field, and write the updated capability mask
2011          * using the Tavor SET_IB command (below).
2012          */
2013         status = tavor_getportinfo_cmd_post(state, port,
2014             TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
2015         if (status != TAVOR_CMD_SUCCESS) {
2016                 TNF_PROBE_1(tavor_port_modify_getportinfo_cmd_fail,
2017                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
2018                 TAVOR_TNF_EXIT(tavor_port_modify);
2019                 return (ibc_get_ci_failure(0));
2020         }
2021 
2022         /*
2023          * Convert InfiniBand-defined port capability flags to the format
2024          * specified by the IBTF.  Specifically, we modify the capability
2025          * mask based on the specified values.
2026          */
2027         capmask = portinfo.CapabilityMask;
2028 
2029         if (flags & IBT_PORT_RESET_SM)
2030                 capmask &= ~SM_CAP_MASK_IS_SM;
2031         else if (flags & IBT_PORT_SET_SM)
2032                 capmask |= SM_CAP_MASK_IS_SM;
2033 
2034         if (flags & IBT_PORT_RESET_SNMP)
2035                 capmask &= ~SM_CAP_MASK_IS_SNMP_SUPPD;
2036         else if (flags & IBT_PORT_SET_SNMP)
2037                 capmask |= SM_CAP_MASK_IS_SNMP_SUPPD;
2038 
2039         if (flags & IBT_PORT_RESET_DEVMGT)
2040                 capmask &= ~SM_CAP_MASK_IS_DM_SUPPD;
2041         else if (flags & IBT_PORT_SET_DEVMGT)
2042                 capmask |= SM_CAP_MASK_IS_DM_SUPPD;
2043 
2044         if (flags & IBT_PORT_RESET_VENDOR)
2045                 capmask &= ~SM_CAP_MASK_IS_VM_SUPPD;
2046         else if (flags & IBT_PORT_SET_VENDOR)
2047                 capmask |= SM_CAP_MASK_IS_VM_SUPPD;
2048 
2049         /*
2050          * Use the Tavor SET_IB command to update the capability mask and
2051          * (possibly) reset the QKey violation counter for the specified port.
2052          * Note: In general, this operation shouldn't fail.  If it does, then
2053          * it is an indication that something (probably in HW, but maybe in
2054          * SW) has gone seriously wrong.
2055          */
2056         status = tavor_set_ib_cmd_post(state, capmask, port, reset_qkey,
2057             TAVOR_SLEEPFLAG_FOR_CONTEXT());
2058         if (status != TAVOR_CMD_SUCCESS) {
2059                 TAVOR_WARNING(state, "failed to modify port capabilities");
2060                 cmn_err(CE_CONT, "Tavor: SET_IB (port %02d) command failed: "
2061                     "%08x\n", port, status);
2062                 TNF_PROBE_1(tavor_port_modify_set_ib_cmd_fail,
2063                     TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
2064                 TAVOR_TNF_EXIT(tavor_port_modify);
2065                 return (ibc_get_ci_failure(0));
2066         }
2067 
2068         TAVOR_TNF_EXIT(tavor_port_modify);
2069         return (DDI_SUCCESS);
2070 }
2071 
2072 
2073 /*
2074  * tavor_set_addr_path()
2075  *    Context: Can be called from interrupt or base context.
2076  *
2077  * Note: This routine is used for two purposes.  It is used to fill in the
2078  * Tavor UDAV fields, and it is used to fill in the address path information
2079  * for QPs.  Because the two Tavor structures are similar, common fields can
2080  * be filled in here.  Because they are slightly different, however, we pass
2081  * an additional flag to indicate which type is being filled.
2082  */
2083 int
2084 tavor_set_addr_path(tavor_state_t *state, ibt_adds_vect_t *av,
2085     tavor_hw_addr_path_t *path, uint_t type, tavor_qphdl_t qp)
2086 {
2087         uint_t          gidtbl_sz;
2088 
2089         TAVOR_TNF_ENTER(tavor_set_addr_path);
2090 
2091         path->ml_path        = av->av_src_path;
2092         path->rlid   = av->av_dlid;
2093         path->sl     = av->av_srvl;
2094 
2095         /* Port number only valid (in "av_port_num") if this is a UDAV */
2096         if (type == TAVOR_ADDRPATH_UDAV) {
2097                 path->portnum = av->av_port_num;
2098         }
2099 
2100         /*
2101          * Validate (and fill in) static rate.
2102          *
2103          * The stat_rate_sup is used to decide how to set the rate and
2104          * if it is zero, the driver uses the old interface.
2105          */
2106         if (state->ts_devlim.stat_rate_sup) {
2107                 if (av->av_srate == IBT_SRATE_20) {
2108                         path->max_stat_rate = 0; /* 4x@DDR injection rate */
2109                 } else if (av->av_srate == IBT_SRATE_5) {
2110                         path->max_stat_rate = 3; /* 1x@DDR injection rate */
2111                 } else if (av->av_srate == IBT_SRATE_10) {
2112                         path->max_stat_rate = 2; /* 4x@SDR injection rate */
2113                 } else if (av->av_srate == IBT_SRATE_2) {
2114                         path->max_stat_rate = 1; /* 1x@SDR injection rate */
2115                 } else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) {
2116                         path->max_stat_rate = 0; /* Max */
2117                 } else {
2118                         TNF_PROBE_1(tavor_set_addr_path_inv_srate_fail,
2119                             TAVOR_TNF_ERROR, "", tnf_uint, srate, av->av_srate);
2120                         TAVOR_TNF_EXIT(tavor_set_addr_path);
2121                         return (IBT_STATIC_RATE_INVALID);
2122                 }
2123         } else {
2124                 if (av->av_srate == IBT_SRATE_10) {
2125                         path->max_stat_rate = 0; /* 4x@SDR injection rate */
2126                 } else if (av->av_srate == IBT_SRATE_2) {
2127                         path->max_stat_rate = 1; /* 1x@SDR injection rate */
2128                 } else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) {
2129                         path->max_stat_rate = 0; /* Max */
2130                 } else {
2131                         TNF_PROBE_1(tavor_set_addr_path_inv_srate_fail,
2132                             TAVOR_TNF_ERROR, "", tnf_uint, srate, av->av_srate);
2133                         TAVOR_TNF_EXIT(tavor_set_addr_path);
2134                         return (IBT_STATIC_RATE_INVALID);
2135                 }
2136         }
2137 
2138         /*
2139          * If this is a QP operation save asoft copy.
2140          */
2141         if (qp) {
2142                 qp->qp_save_srate = av->av_srate;
2143         }
2144 
2145         /* If "grh" flag is set, then check for valid SGID index too */
2146         gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
2147         if ((av->av_send_grh) && (av->av_sgid_ix > gidtbl_sz)) {
2148                 TNF_PROBE_1(tavor_set_addr_path_inv_sgid_ix_fail,
2149                     TAVOR_TNF_ERROR, "", tnf_uint, sgid_ix, av->av_sgid_ix);
2150                 TAVOR_TNF_EXIT(tavor_set_addr_path);
2151                 return (IBT_SGID_INVALID);
2152         }
2153 
2154         /*
2155          * Fill in all "global" values regardless of the value in the GRH
2156          * flag.  Because "grh" is not set unless "av_send_grh" is set, the
2157          * hardware will ignore the other "global" values as necessary.  Note:
2158          * SW does this here to enable later query operations to return
2159          * exactly the same params that were passed when the addr path was
2160          * last written.
2161          */
2162         path->grh = av->av_send_grh;
2163         if (type == TAVOR_ADDRPATH_QP) {
2164                 path->mgid_index = av->av_sgid_ix;
2165         } else {
2166                 /*
2167                  * For Tavor UDAV, the "mgid_index" field is the index into
2168                  * a combined table (not a per-port table). So some extra
2169                  * calculations are necessary.
2170                  */
2171                 path->mgid_index = ((av->av_port_num - 1) * gidtbl_sz) +
2172                     av->av_sgid_ix;
2173         }
2174         path->flow_label = av->av_flow;
2175         path->tclass  = av->av_tclass;
2176         path->hop_limit       = av->av_hop;
2177         path->rgid_h  = av->av_dgid.gid_prefix;
2178 
2179         /*
2180          * According to Tavor PRM, the (31:0) part of rgid_l must be set to
2181          * "0x2" if the 'grh' or 'g' bit is cleared.  It also says that we
2182          * only need to do it for UDAV's.  So we enforce that here.
2183          *
2184          * NOTE: The entire 64 bits worth of GUID info is actually being
2185          * preserved (for UDAVs) by the callers of this function
2186          * (tavor_ah_alloc() and tavor_ah_modify()) and as long as the
2187          * 'grh' bit is not set, the upper 32 bits (63:32) of rgid_l are
2188          * "don't care".
2189          */
2190         if ((path->grh) || (type == TAVOR_ADDRPATH_QP)) {
2191                 path->rgid_l = av->av_dgid.gid_guid;
2192         } else {
2193                 path->rgid_l = 0x2;
2194         }
2195 
2196         TAVOR_TNF_EXIT(tavor_set_addr_path);
2197         return (DDI_SUCCESS);
2198 }
2199 
2200 
2201 /*
2202  * tavor_get_addr_path()
2203  *    Context: Can be called from interrupt or base context.
2204  *
2205  * Note: Just like tavor_set_addr_path() above, this routine is used for two
2206  * purposes.  It is used to read in the Tavor UDAV fields, and it is used to
2207  * read in the address path information for QPs.  Because the two Tavor
2208  * structures are similar, common fields can be read in here.  But because
2209  * they are slightly different, we pass an additional flag to indicate which
2210  * type is being read.
2211  */
2212 void
2213 tavor_get_addr_path(tavor_state_t *state, tavor_hw_addr_path_t *path,
2214     ibt_adds_vect_t *av, uint_t type, tavor_qphdl_t qp)
2215 {
2216         uint_t          gidtbl_sz;
2217 
2218         av->av_src_path      = path->ml_path;
2219         av->av_port_num      = path->portnum;
2220         av->av_dlid  = path->rlid;
2221         av->av_srvl  = path->sl;
2222 
2223         /*
2224          * Set "av_ipd" value from max_stat_rate.
2225          */
2226         if (qp) {
2227                 /*
2228                  * If a QP operation use the soft copy
2229                  */
2230                 av->av_srate = qp->qp_save_srate;
2231         } else {
2232                 /*
2233                  * The stat_rate_sup is used to decide how the srate value is
2234                  * set and
2235                  * if it is zero, the driver uses the old interface.
2236                  */
2237                 if (state->ts_devlim.stat_rate_sup) {
2238                         if (path->max_stat_rate      == 0) {
2239                                 av->av_srate = IBT_SRATE_20; /* 4x@DDR rate */
2240                         } else if (path->max_stat_rate       == 1) {
2241                                 av->av_srate = IBT_SRATE_2;  /* 1x@SDR rate */
2242                         } else if (path->max_stat_rate       == 2) {
2243                                 av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
2244                         } else if (path->max_stat_rate       == 3) {
2245                                 av->av_srate = IBT_SRATE_5;  /* 1xDDR rate */
2246                         }
2247                 } else {
2248                         if (path->max_stat_rate      == 0) {
2249                                 av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
2250                         } else if (path->max_stat_rate       == 1) {
2251                                 av->av_srate = IBT_SRATE_2;  /* 1x@SDR rate */
2252                         }
2253                 }
2254         }
2255 
2256         /*
2257          * Extract all "global" values regardless of the value in the GRH
2258          * flag.  Because "av_send_grh" is set only if "grh" is set, software
2259          * knows to ignore the other "global" values as necessary.  Note: SW
2260          * does it this way to enable these query operations to return exactly
2261          * the same params that were passed when the addr path was last written.
2262          */
2263         av->av_send_grh              = path->grh;
2264         if (type == TAVOR_ADDRPATH_QP) {
2265                 av->av_sgid_ix  = path->mgid_index;
2266         } else {
2267                 /*
2268                  * For Tavor UDAV, the "mgid_index" field is the index into
2269                  * a combined table (not a per-port table). So some extra
2270                  * calculations are necessary.
2271                  */
2272                 gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
2273                 av->av_sgid_ix = path->mgid_index - ((av->av_port_num - 1) *
2274                     gidtbl_sz);
2275         }
2276         av->av_flow          = path->flow_label;
2277         av->av_tclass                = path->tclass;
2278         av->av_hop           = path->hop_limit;
2279         av->av_dgid.gid_prefix       = path->rgid_h;
2280         av->av_dgid.gid_guid = path->rgid_l;
2281 }
2282 
2283 
2284 /*
2285  * tavor_portnum_is_valid()
2286  *    Context: Can be called from interrupt or base context.
2287  */
2288 int
2289 tavor_portnum_is_valid(tavor_state_t *state, uint_t portnum)
2290 {
2291         uint_t  max_port;
2292 
2293         max_port = state->ts_cfg_profile->cp_num_ports;
2294         if ((portnum <= max_port) && (portnum != 0)) {
2295                 return (1);
2296         } else {
2297                 return (0);
2298         }
2299 }
2300 
2301 
2302 /*
2303  * tavor_pkeyindex_is_valid()
2304  *    Context: Can be called from interrupt or base context.
2305  */
2306 int
2307 tavor_pkeyindex_is_valid(tavor_state_t *state, uint_t pkeyindx)
2308 {
2309         uint_t  max_pkeyindx;
2310 
2311         max_pkeyindx = 1 << state->ts_cfg_profile->cp_log_max_pkeytbl;
2312         if (pkeyindx < max_pkeyindx) {
2313                 return (1);
2314         } else {
2315                 return (0);
2316         }
2317 }
2318 
2319 
2320 /*
2321  * tavor_queue_alloc()
2322  *    Context: Can be called from interrupt or base context.
2323  */
2324 int
2325 tavor_queue_alloc(tavor_state_t *state, tavor_qalloc_info_t *qa_info,
2326     uint_t sleepflag)
2327 {
2328         ddi_dma_attr_t          dma_attr;
2329         int                     (*callback)(caddr_t);
2330         uint64_t                realsize, alloc_mask;
2331         uint_t                  dma_xfer_mode, type;
2332         int                     flag, status;
2333 
2334         TAVOR_TNF_ENTER(tavor_queue_alloc);
2335 
2336         /* Set the callback flag appropriately */
2337         callback = (sleepflag == TAVOR_SLEEP) ? DDI_DMA_SLEEP :
2338             DDI_DMA_DONTWAIT;
2339 
2340         /*
2341          * Initialize many of the default DMA attributes.  Then set additional
2342          * alignment restrictions as necessary for the queue memory.  Also
2343          * respect the configured value for IOMMU bypass
2344          */
2345         tavor_dma_attr_init(&dma_attr);
2346         dma_attr.dma_attr_align = qa_info->qa_bind_align;
2347         type = state->ts_cfg_profile->cp_iommu_bypass;
2348         if (type == TAVOR_BINDMEM_BYPASS) {
2349                 dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2350         }
2351 
2352         /* Allocate a DMA handle */
2353         status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr, callback, NULL,
2354             &qa_info->qa_dmahdl);
2355         if (status != DDI_SUCCESS) {
2356                 TNF_PROBE_0(tavor_queue_alloc_dmahdl_fail, TAVOR_TNF_ERROR, "");
2357                 TAVOR_TNF_EXIT(tavor_queue_alloc);
2358                 return (DDI_FAILURE);
2359         }
2360 
2361         /*
2362          * Determine the amount of memory to allocate, depending on the values
2363          * in "qa_bind_align" and "qa_alloc_align".  The problem we are trying
2364          * to solve here is that allocating a DMA handle with IOMMU bypass
2365          * (DDI_DMA_FORCE_PHYSICAL) constrains us to only requesting alignments
2366          * that are less than the page size.  Since we may need stricter
2367          * alignments on the memory allocated by ddi_dma_mem_alloc() (e.g. in
2368          * Tavor QP work queue memory allocation), we use the following method
2369          * to calculate how much additional memory to request, and we enforce
2370          * our own alignment on the allocated result.
2371          */
2372         alloc_mask = qa_info->qa_alloc_align - 1;
2373         if (qa_info->qa_bind_align == qa_info->qa_alloc_align) {
2374                 realsize = qa_info->qa_size;
2375         } else {
2376                 realsize = qa_info->qa_size + alloc_mask;
2377         }
2378 
2379         /*
2380          * If we are to allocate the queue from system memory, then use
2381          * ddi_dma_mem_alloc() to find the space.  Otherwise, if we are to
2382          * allocate the queue from locally-attached DDR memory, then use the
2383          * vmem allocator to find the space.  In either case, return a pointer
2384          * to the memory range allocated (including any necessary alignment
2385          * adjustments), the "real" memory pointer, the "real" size, and a
2386          * ddi_acc_handle_t to use when reading from/writing to the memory.
2387          */
2388         if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {
2389 
2390                 /*
2391                  * Determine whether to map STREAMING or CONSISTENT.  This is
2392                  * based on the value set in the configuration profile at
2393                  * attach time.
2394                  */
2395                 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
2396 
2397                 /* Allocate system memory for the queue */
2398                 status = ddi_dma_mem_alloc(qa_info->qa_dmahdl, realsize,
2399                     &state->ts_reg_accattr, dma_xfer_mode, callback, NULL,
2400                     (caddr_t *)&qa_info->qa_buf_real,
2401                     (size_t *)&qa_info->qa_buf_realsz, &qa_info->qa_acchdl);
2402                 if (status != DDI_SUCCESS) {
2403                         ddi_dma_free_handle(&qa_info->qa_dmahdl);
2404                         TNF_PROBE_0(tavor_queue_alloc_dma_memalloc_fail,
2405                             TAVOR_TNF_ERROR, "");
2406                         TAVOR_TNF_EXIT(tavor_queue_alloc);
2407                         return (DDI_FAILURE);
2408                 }
2409 
2410                 /*
2411                  * Save temporary copy of the real pointer.  (This may be
2412                  * modified in the last step below).
2413                  */
2414                 qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2415 
2416         } else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {
2417 
2418                 /* Allocate userland mappable memory for the queue */
2419                 flag = (sleepflag == TAVOR_SLEEP) ? DDI_UMEM_SLEEP :
2420                     DDI_UMEM_NOSLEEP;
2421                 qa_info->qa_buf_real = ddi_umem_alloc(realsize, flag,
2422                     &qa_info->qa_umemcookie);
2423                 if (qa_info->qa_buf_real == NULL) {
2424                         ddi_dma_free_handle(&qa_info->qa_dmahdl);
2425                         TNF_PROBE_0(tavor_queue_alloc_umem_fail,
2426                             TAVOR_TNF_ERROR, "");
2427                         TAVOR_TNF_EXIT(tavor_queue_alloc);
2428                         return (DDI_FAILURE);
2429                 }
2430 
2431                 /*
2432                  * Save temporary copy of the real pointer.  (This may be
2433                  * modified in the last step below).
2434                  */
2435                 qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2436 
2437         } else {  /* TAVOR_QUEUE_LOCATION_INDDR */
2438 
2439                 /* Allocate DDR memory for the queue */
2440                 flag = (sleepflag == TAVOR_SLEEP) ? VM_SLEEP : VM_NOSLEEP;
2441                 qa_info->qa_buf_real = (uint32_t *)vmem_xalloc(
2442                     state->ts_ddrvmem, realsize, qa_info->qa_bind_align, 0, 0,
2443                     NULL, NULL, flag);
2444                 if (qa_info->qa_buf_real == NULL) {
2445                         ddi_dma_free_handle(&qa_info->qa_dmahdl);
2446                         TNF_PROBE_0(tavor_queue_alloc_vmxa_fail,
2447                             TAVOR_TNF_ERROR, "");
2448                         TAVOR_TNF_EXIT(tavor_queue_alloc);
2449                         return (DDI_FAILURE);
2450                 }
2451 
2452                 /*
2453                  * Since "qa_buf_real" will be a PCI address (the offset into
2454                  * the DDR memory), we first need to do some calculations to
2455                  * convert it to its kernel mapped address.  (Note: This may
2456                  * be modified again below, when any additional "alloc"
2457                  * alignment constraint is applied).
2458                  */
2459                 qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
2460                     state->ts_reg_ddr_baseaddr) + ((uintptr_t)
2461                     qa_info->qa_buf_real - state->ts_ddr.ddr_baseaddr));
2462                 qa_info->qa_buf_realsz       = realsize;
2463                 qa_info->qa_acchdl   = state->ts_reg_ddrhdl;
2464         }
2465 
2466         /*
2467          * The last step is to ensure that the final address ("qa_buf_aligned")
2468          * has the appropriate "alloc" alignment restriction applied to it
2469          * (if necessary).
2470          */
2471         if (qa_info->qa_bind_align != qa_info->qa_alloc_align) {
2472                 qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
2473                     qa_info->qa_buf_aligned + alloc_mask) & ~alloc_mask);
2474         }
2475 
2476         TAVOR_TNF_EXIT(tavor_queue_alloc);
2477         return (DDI_SUCCESS);
2478 }
2479 
2480 
2481 /*
2482  * tavor_queue_free()
2483  *    Context: Can be called from interrupt or base context.
2484  */
2485 void
2486 tavor_queue_free(tavor_state_t *state, tavor_qalloc_info_t *qa_info)
2487 {
2488         TAVOR_TNF_ENTER(tavor_queue_free);
2489 
2490         /*
2491          * Depending on how (i.e. from where) we allocated the memory for
2492          * this queue, we choose the appropriate method for releasing the
2493          * resources.
2494          */
2495         if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {
2496 
2497                 ddi_dma_mem_free(&qa_info->qa_acchdl);
2498 
2499         } else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {
2500 
2501                 ddi_umem_free(qa_info->qa_umemcookie);
2502 
2503         } else {  /* TAVOR_QUEUE_LOCATION_INDDR */
2504 
2505                 vmem_xfree(state->ts_ddrvmem, qa_info->qa_buf_real,
2506                     qa_info->qa_buf_realsz);
2507         }
2508 
2509         /* Always free the dma handle */
2510         ddi_dma_free_handle(&qa_info->qa_dmahdl);
2511 
2512         TAVOR_TNF_EXIT(tavor_queue_free);
2513 }
2514 
2515 
2516 /*
2517  * tavor_dmaattr_get()
2518  *    Context: Can be called from interrupt or base context.
2519  */
2520 void
2521 tavor_dma_attr_init(ddi_dma_attr_t *dma_attr)
2522 {
2523         dma_attr->dma_attr_version   = DMA_ATTR_V0;
2524         dma_attr->dma_attr_addr_lo   = 0;
2525         dma_attr->dma_attr_addr_hi   = 0xFFFFFFFFFFFFFFFFull;
2526         dma_attr->dma_attr_count_max = 0xFFFFFFFFFFFFFFFFull;
2527         dma_attr->dma_attr_align     = 1;
2528         dma_attr->dma_attr_burstsizes        = 0x3FF;
2529         dma_attr->dma_attr_minxfer   = 1;
2530         dma_attr->dma_attr_maxxfer   = 0xFFFFFFFFFFFFFFFFull;
2531         dma_attr->dma_attr_seg               = 0xFFFFFFFFFFFFFFFFull;
2532         dma_attr->dma_attr_sgllen    = 0x7FFFFFFF;
2533         dma_attr->dma_attr_granular  = 1;
2534         dma_attr->dma_attr_flags     = 0;
2535 }