1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * tavor_mr.c
  28  *    Tavor Memory Region/Window Routines
  29  *
  30  *    Implements all the routines necessary to provide the requisite memory
  31  *    registration verbs.  These include operations like RegisterMemRegion(),
  32  *    DeregisterMemRegion(), ReregisterMemRegion, RegisterSharedMemRegion,
  33  *    etc., that affect Memory Regions.  It also includes the verbs that
  34  *    affect Memory Windows, including AllocMemWindow(), FreeMemWindow(),
  35  *    and QueryMemWindow().
  36  */
  37 
  38 #include <sys/types.h>
  39 #include <sys/conf.h>
  40 #include <sys/ddi.h>
  41 #include <sys/sunddi.h>
  42 #include <sys/modctl.h>
  43 #include <sys/esunddi.h>
  44 
  45 #include <sys/ib/adapters/tavor/tavor.h>
  46 
  47 
  48 /*
  49  * Used by tavor_mr_keycalc() below to fill in the "unconstrained" portion
  50  * of Tavor memory keys (LKeys and RKeys)
  51  */
  52 static uint_t tavor_debug_memkey_cnt = 0x00000000;
  53 
  54 static int tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
  55     tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op);
  56 static int tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
  57     tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
  58     tavor_mr_options_t *op);
  59 static int tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
  60     tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
  61     uint_t sleep, uint_t *dereg_level);
  62 static uint64_t tavor_mr_nummtt_needed(tavor_state_t *state,
  63     tavor_bind_info_t *bind, uint_t *mtt_pgsize);
  64 static int tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
  65     ddi_dma_handle_t dmahdl, uint_t sleep);
  66 static void tavor_mr_mem_unbind(tavor_state_t *state,
  67     tavor_bind_info_t *bind);
  68 static int tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
  69     uint32_t mtt_pgsize_bits);
  70 static int tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc);
  71 static int tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc);
  72 
  73 /*
  74  * The Tavor umem_lockmemory() callback ops.  When userland memory is
  75  * registered, these callback ops are specified.  The tavor_umap_umemlock_cb()
  76  * callback will be called whenever the memory for the corresponding
  77  * ddi_umem_cookie_t is being freed.
  78  */
static struct umem_callback_ops tavor_umem_cbops = {
	UMEM_CALLBACK_VERSION,		/* umem callback interface version */
	tavor_umap_umemlock_cb,		/* invoked when the locked umem pages are freed */
};
  83 
  84 
  85 /*
  86  * tavor_mr_register()
  87  *    Context: Can be called from interrupt or base context.
  88  */
  89 int
  90 tavor_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
  91     ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
  92 {
  93         tavor_bind_info_t       bind;
  94         int                     status;
  95 
  96         TAVOR_TNF_ENTER(tavor_mr_register);
  97 
  98         /*
  99          * Fill in the "bind" struct.  This struct provides the majority
 100          * of the information that will be used to distinguish between an
 101          * "addr" binding (as is the case here) and a "buf" binding (see
 102          * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
 103          * which does most of the "heavy lifting" for the Tavor memory
 104          * registration routines.
 105          */
 106         bind.bi_type  = TAVOR_BINDHDL_VADDR;
 107         bind.bi_addr  = mr_attr->mr_vaddr;
 108         bind.bi_len   = mr_attr->mr_len;
 109         bind.bi_as    = mr_attr->mr_as;
 110         bind.bi_flags = mr_attr->mr_flags;
 111         status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
 112         if (status != DDI_SUCCESS) {
 113                 TNF_PROBE_0(tavor_mr_register_cmnreg_fail,
 114                     TAVOR_TNF_ERROR, "");
 115                 TAVOR_TNF_EXIT(tavor_mr_register);
 116                 return (status);
 117         }
 118 
 119         TAVOR_TNF_EXIT(tavor_mr_register);
 120         return (DDI_SUCCESS);
 121 }
 122 
 123 
 124 /*
 125  * tavor_mr_register_buf()
 126  *    Context: Can be called from interrupt or base context.
 127  */
 128 int
 129 tavor_mr_register_buf(tavor_state_t *state, tavor_pdhdl_t pd,
 130     ibt_smr_attr_t *mr_attr, struct buf *buf, tavor_mrhdl_t *mrhdl,
 131     tavor_mr_options_t *op)
 132 {
 133         tavor_bind_info_t       bind;
 134         int                     status;
 135 
 136         TAVOR_TNF_ENTER(tavor_mr_register_buf);
 137 
 138         /*
 139          * Fill in the "bind" struct.  This struct provides the majority
 140          * of the information that will be used to distinguish between an
 141          * "addr" binding (see above) and a "buf" binding (as is the case
 142          * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
 143          * which does most of the "heavy lifting" for the Tavor memory
 144          * registration routines.  Note: We have chosen to provide
 145          * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
 146          * not set).  It is not critical what value we choose here as it need
 147          * only be unique for the given RKey (which will happen by default),
 148          * so the choice here is somewhat arbitrary.
 149          */
 150         bind.bi_type  = TAVOR_BINDHDL_BUF;
 151         bind.bi_buf   = buf;
 152         if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
 153                 bind.bi_addr  = mr_attr->mr_vaddr;
 154         } else {
 155                 bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
 156         }
 157         bind.bi_as    = NULL;
 158         bind.bi_len   = (uint64_t)buf->b_bcount;
 159         bind.bi_flags = mr_attr->mr_flags;
 160         status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
 161         if (status != DDI_SUCCESS) {
 162                 TNF_PROBE_0(tavor_mr_register_buf_cmnreg_fail,
 163                     TAVOR_TNF_ERROR, "");
 164                 TAVOR_TNF_EXIT(tavor_mr_register_buf);
 165                 return (status);
 166         }
 167 
 168         TAVOR_TNF_EXIT(tavor_mr_register_buf);
 169         return (DDI_SUCCESS);
 170 }
 171 
 172 
 173 /*
 174  * tavor_mr_register_shared()
 175  *    Context: Can be called from interrupt or base context.
 176  */
int
tavor_mr_register_shared(tavor_state_t *state, tavor_mrhdl_t mrhdl,
    tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*mpt, *mtt, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	*bind;
	ddi_umem_cookie_t	umem_cookie;
	size_t			umem_len;
	caddr_t			umem_addr;
	uint64_t		mtt_addr, mtt_ddrbaseaddr, pgsize_msk;
	uint_t			sleep, mr_is_umem;
	int			status, umem_flags;
	char			*errormsg;	/* set by TAVOR_TNF_FAIL below */

	TAVOR_TNF_ENTER(tavor_mr_register_shared);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP :
	    TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mrshared_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry.  This will be filled in with all the
	 * necessary parameters to define the shared memory region.
	 * Specifically, it will be made to reference the currently existing
	 * MTT entries and ownership of the MPT will be passed to the hardware
	 * in the last step below.  If we fail here, we must undo the
	 * protection domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto mrshared_fail1;
	}

	/*
	 * Allocate the software structure for tracking the shared memory
	 * region (i.e. the Tavor Memory Region handle).  If we fail here, we
	 * must undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto mrshared_fail2;
	}
	mr = (tavor_mrhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/*
	 * Setup and validate the memory region access flags.  This means
	 * translating the IBTF's enable flags into the access flags that
	 * will be used in later operations.
	 */
	mr->mr_accflag = 0;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND)
		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ)
		mr->mr_accflag |= IBT_MR_REMOTE_READ;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/* Grab the MR lock for the current memory region */
	mutex_enter(&mrhdl->mr_lock);

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
	 * If so, this is an error, return failure.
	 */
	if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
		mutex_exit(&mrhdl->mr_lock);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
		goto mrshared_fail3;
	}

	/*
	 * Determine if the original memory was from userland and, if so, pin
	 * the pages (again) with umem_lockmemory().  This will guarantee a
	 * separate callback for each of this shared region's MR handles.
	 * If this is userland memory, then allocate an entry in the
	 * "userland resources database".  This will later be added to
	 * the database (after all further memory registration operations are
	 * successful).  If we fail here, we must undo all the above setup.
	 */
	mr_is_umem = mrhdl->mr_is_umem;
	if (mr_is_umem) {
		/*
		 * Round the lock range out to whole pages: start at the
		 * page-aligned base and extend to cover the page offset
		 * of the original bind address plus the bind length.
		 */
		umem_len   = ptob(btopr(mrhdl->mr_bindinfo.bi_len +
		    ((uintptr_t)mrhdl->mr_bindinfo.bi_addr & PAGEOFFSET)));
		umem_addr  = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr &
		    ~PAGEOFFSET);
		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
		    DDI_UMEMLOCK_LONGTERM);
		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
		    &umem_cookie, &tavor_umem_cbops, NULL);
		if (status != 0) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
			goto mrshared_fail3;
		}

		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto mrshared_fail4;
		}
	}

	/*
	 * Copy the MTT resource pointer (and additional parameters) from
	 * the original Tavor Memory Region handle.  Note: this is normally
	 * where the tavor_mr_mem_bind() routine would be called, but because
	 * we already have bound and filled-in MTT entries it is simply a
	 * matter here of managing the MTT reference count and grabbing the
	 * address of the MTT table entries (for filling in the shared region's
	 * MPT entry).
	 */
	mr->mr_mttrsrcp	       = mrhdl->mr_mttrsrcp;
	mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz;
	mr->mr_bindinfo	       = mrhdl->mr_bindinfo;
	mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp;
	mutex_exit(&mrhdl->mr_lock);
	bind = &mr->mr_bindinfo;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
	mtt = mr->mr_mttrsrcp;

	/*
	 * Increment the MTT reference count (to reflect the fact that
	 * the MTT is now shared)
	 */
	(void) tavor_mtt_refcnt_inc(mr->mr_mttrefcntp);

	/*
	 * Update the new "bind" virtual address.  Do some extra work here
	 * to ensure proper alignment.  That is, make sure that the page
	 * offset for the beginning of the old range is the same as the
	 * offset for this new mapping
	 */
	pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
	bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) |
	    (mr->mr_bindinfo.bi_addr & pgsize_msk));

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * in the next step when we are setting up the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;
	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
	/* page_sz is encoded as log2(pagesize) - 12 (i.e. relative to 4KB) */
	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
	mpt_entry.mem_key	= mr->mr_lkey;
	mpt_entry.pd		= pd->pd_pdnum;
	mpt_entry.start_addr	= bind->bi_addr;
	mpt_entry.reg_win_len	= bind->bi_len;
	mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	/* low word of the MTT segment address, in 64-byte units */
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_register_shared_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto mrshared_fail5;
	}

	/*
	 * Fill in the rest of the Tavor Memory Region handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MR.
	 */
	mr->mr_mptrsrcp	       = mpt;
	mr->mr_mttrsrcp	       = mtt;
	mr->mr_pdhdl   = pd;
	mr->mr_rsrcp   = rsrc;
	mr->mr_is_umem	       = mr_is_umem;
	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
	mr->mr_umem_cbfunc = NULL;
	mr->mr_umem_cbarg1 = NULL;
	mr->mr_umem_cbarg2 = NULL;

	/*
	 * If this is userland memory, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later coordination between the tavor_umap_umemlock_cb()
	 * callback and tavor_mr_deregister().
	 */
	if (mr_is_umem) {
		tavor_umap_db_add(umapdb);
	}

	*mrhdl_new = mr;

	TAVOR_TNF_EXIT(tavor_mr_register_shared);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine.
 * Each label undoes exactly the work completed before its corresponding
 * failure point, in reverse order of acquisition.
 */
mrshared_fail5:
	(void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
	if (mr_is_umem) {
		tavor_umap_db_free(umapdb);
	}
mrshared_fail4:
	if (mr_is_umem) {
		ddi_umem_unlock(umem_cookie);
	}
mrshared_fail3:
	tavor_rsrc_free(state, &rsrc);
mrshared_fail2:
	tavor_rsrc_free(state, &mpt);
mrshared_fail1:
	tavor_pd_refcnt_dec(pd);
mrshared_fail:
	TNF_PROBE_1(tavor_mr_register_shared_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mr_register_shared);
	return (status);
}
 468 
 469 
 470 /*
 471  * tavor_mr_deregister()
 472  *    Context: Can be called from interrupt or base context.
 473  */
 474 /* ARGSUSED */
int
tavor_mr_deregister(tavor_state_t *state, tavor_mrhdl_t *mrhdl, uint_t level,
    uint_t sleep)
{
	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
	tavor_umap_db_entry_t	*umapdb;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	*bind;
	uint64_t		value;
	int			status, shared_mtt;
	char			*errormsg;	/* set by TAVOR_TNF_FAIL */

	TAVOR_TNF_ENTER(tavor_mr_deregister);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
		TNF_PROBE_1(tavor_mr_deregister_fail, TAVOR_TNF_ERROR, "",
		    tnf_string, msg, errormsg);
		TAVOR_TNF_EXIT(tavor_mr_deregister);
		return (status);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Region
	 * handle.  This is necessary here because the resource for the
	 * MR handle is going to be freed up as part of the this
	 * deregistration
	 */
	mr	= *mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt	= mr->mr_mptrsrcp;
	mtt	= mr->mr_mttrsrcp;
	mtt_refcnt = mr->mr_mttrefcntp;
	rsrc	= mr->mr_rsrcp;
	pd	= mr->mr_pdhdl;
	bind	= &mr->mr_bindinfo;

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of the tavor_umap_umemlock_cb() callback.
	 * If so, then jump to the end and free the remaining resources.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		goto mrdereg_finish_cleanup;
	}

	/*
	 * We must drop the "mr_lock" here to ensure that both SLEEP and
	 * NOSLEEP calls into the firmware work as expected.  Also, if two
	 * threads are attemping to access this MR (via de-register,
	 * re-register, or otherwise), then we allow the firmware to enforce
	 * the checking, that only one deregister is valid.
	 */
	mutex_exit(&mr->mr_lock);

	/*
	 * Reclaim MPT entry from hardware (if necessary).  Since the
	 * tavor_mr_deregister() routine is used in the memory region
	 * reregistration process as well, it is possible that we will
	 * not always wish to reclaim ownership of the MPT.  Check the
	 * "level" arg and, if necessary, attempt to reclaim it.  If
	 * the ownership transfer fails for any reason, we check to see
	 * what command status was returned from the hardware.  The only
	 * "expected" error status is the one that indicates an attempt to
	 * deregister a memory region that has memory windows bound to it
	 */
	if (level >= TAVOR_MR_DEREG_ALL) {
		status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT,
		    NULL, 0, mpt->tr_indx, sleep);
		if (status != TAVOR_CMD_SUCCESS) {
			if (status == TAVOR_CMD_REG_BOUND) {
				/* Windows still bound to this region */
				TAVOR_TNF_EXIT(tavor_mr_deregister);
				return (IBT_MR_IN_USE);
			} else {
				cmn_err(CE_CONT, "Tavor: HW2SW_MPT command "
				    "failed: %08x\n", status);
				TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_mr_deregister);
				return (IBT_INVALID_PARAM);
			}
		}
	}

	/*
	 * Re-grab the mr_lock here.  Since further access to the protected
	 * 'mr' structure is needed, and we would have returned previously for
	 * the multiple deregistration case, we can safely grab the lock here.
	 */
	mutex_enter(&mr->mr_lock);

	/*
	 * If the memory had come from userland, then we do a lookup in the
	 * "userland resources database".  On success, we free the entry, call
	 * ddi_umem_unlock(), and continue the cleanup.  On failure (which is
	 * an indication that the umem_lockmemory() callback has called
	 * tavor_mr_deregister()), we call ddi_umem_unlock() and invalidate
	 * the "mr_umemcookie" field in the MR handle (this will be used
	 * later to detect that only partial cleaup still remains to be done
	 * on the MR handle).
	 */
	if (mr->mr_is_umem) {
		status = tavor_umap_db_find(state->ts_instance,
		    (uint64_t)(uintptr_t)mr->mr_umemcookie,
		    MLNX_UMAP_MRMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status == DDI_SUCCESS) {
			tavor_umap_db_free(umapdb);
			ddi_umem_unlock(mr->mr_umemcookie);
		} else {
			ddi_umem_unlock(mr->mr_umemcookie);
			mr->mr_umemcookie = NULL;
		}
	}

	/* mtt_refcnt is NULL in the case of tavor_dma_mr_register() */
	if (mtt_refcnt != NULL) {
		/*
		 * Decrement the MTT reference count.  Since the MTT resource
		 * may be shared between multiple memory regions (as a result
		 * of a "RegisterSharedMR" verb) it is important that we not
		 * free up or unbind resources prematurely.  If it's not shared
		 * (as indicated by the return status), then free the resource.
		 */
		shared_mtt = tavor_mtt_refcnt_dec(mtt_refcnt);
		if (!shared_mtt) {
			tavor_rsrc_free(state, &mtt_refcnt);
		}

		/*
		 * Free up the MTT entries and unbind the memory.  Here,
		 * as above, we attempt to free these resources only if
		 * it is appropriate to do so.
		 */
		if (!shared_mtt) {
			if (level >= TAVOR_MR_DEREG_NO_HW2SW_MPT) {
				tavor_mr_mem_unbind(state, bind);
			}
			tavor_rsrc_free(state, &mtt);
		}
	}

	/*
	 * If the MR handle has been invalidated, then drop the
	 * lock and return success.  Note: This only happens because
	 * the umem_lockmemory() callback has been triggered.  The
	 * cleanup here is partial, and further cleanup (in a
	 * subsequent tavor_mr_deregister() call) will be necessary.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		mutex_exit(&mr->mr_lock);
		TAVOR_TNF_EXIT(tavor_mr_deregister);
		return (DDI_SUCCESS);
	}

mrdereg_finish_cleanup:
	mutex_exit(&mr->mr_lock);

	/* Free the Tavor Memory Region handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mrhdl pointer to NULL and return success */
	*mrhdl = NULL;

	TAVOR_TNF_EXIT(tavor_mr_deregister);
	return (DDI_SUCCESS);
}
 657 
 658 
 659 /*
 660  * tavor_mr_query()
 661  *    Context: Can be called from interrupt or base context.
 662  */
 663 /* ARGSUSED */
 664 int
 665 tavor_mr_query(tavor_state_t *state, tavor_mrhdl_t mr,
 666     ibt_mr_query_attr_t *attr)
 667 {
 668         TAVOR_TNF_ENTER(tavor_mr_query);
 669 
 670         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr))
 671 
 672         mutex_enter(&mr->mr_lock);
 673 
 674         /*
 675          * Check here to see if the memory region has already been partially
 676          * deregistered as a result of a tavor_umap_umemlock_cb() callback.
 677          * If so, this is an error, return failure.
 678          */
 679         if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
 680                 mutex_exit(&mr->mr_lock);
 681                 TNF_PROBE_0(tavor_mr_query_inv_mrhdl_fail, TAVOR_TNF_ERROR, "");
 682                 TAVOR_TNF_EXIT(tavor_mr_query);
 683                 return (IBT_MR_HDL_INVALID);
 684         }
 685 
 686         /* Fill in the queried attributes */
 687         attr->mr_attr_flags = mr->mr_accflag;
 688         attr->mr_pd  = (ibt_pd_hdl_t)mr->mr_pdhdl;
 689 
 690         /* Fill in the "local" attributes */
 691         attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey;
 692         attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
 693         attr->mr_lbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
 694 
 695         /*
 696          * Fill in the "remote" attributes (if necessary).  Note: the
 697          * remote attributes are only valid if the memory region has one
 698          * or more of the remote access flags set.
 699          */
 700         if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
 701             (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
 702             (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
 703                 attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey;
 704                 attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
 705                 attr->mr_rbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
 706         }
 707 
 708         /*
 709          * If region is mapped for streaming (i.e. noncoherent), then set sync
 710          * is required
 711          */
 712         attr->mr_sync_required = (mr->mr_bindinfo.bi_flags &
 713             IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE;
 714 
 715         mutex_exit(&mr->mr_lock);
 716         TAVOR_TNF_EXIT(tavor_mr_query);
 717         return (DDI_SUCCESS);
 718 }
 719 
 720 
 721 /*
 722  * tavor_mr_reregister()
 723  *    Context: Can be called from interrupt or base context.
 724  */
 725 int
 726 tavor_mr_reregister(tavor_state_t *state, tavor_mrhdl_t mr,
 727     tavor_pdhdl_t pd, ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new,
 728     tavor_mr_options_t *op)
 729 {
 730         tavor_bind_info_t       bind;
 731         int                     status;
 732 
 733         TAVOR_TNF_ENTER(tavor_mr_reregister);
 734 
 735         /*
 736          * Fill in the "bind" struct.  This struct provides the majority
 737          * of the information that will be used to distinguish between an
 738          * "addr" binding (as is the case here) and a "buf" binding (see
 739          * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
 740          * which does most of the "heavy lifting" for the Tavor memory
 741          * registration (and reregistration) routines.
 742          */
 743         bind.bi_type  = TAVOR_BINDHDL_VADDR;
 744         bind.bi_addr  = mr_attr->mr_vaddr;
 745         bind.bi_len   = mr_attr->mr_len;
 746         bind.bi_as    = mr_attr->mr_as;
 747         bind.bi_flags = mr_attr->mr_flags;
 748         status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
 749         if (status != DDI_SUCCESS) {
 750                 TNF_PROBE_0(tavor_mr_reregister_cmnreg_fail,
 751                     TAVOR_TNF_ERROR, "");
 752                 TAVOR_TNF_EXIT(tavor_mr_reregister);
 753                 return (status);
 754         }
 755 
 756         TAVOR_TNF_EXIT(tavor_mr_reregister);
 757         return (DDI_SUCCESS);
 758 }
 759 
 760 
 761 /*
 762  * tavor_mr_reregister_buf()
 763  *    Context: Can be called from interrupt or base context.
 764  */
 765 int
 766 tavor_mr_reregister_buf(tavor_state_t *state, tavor_mrhdl_t mr,
 767     tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf,
 768     tavor_mrhdl_t *mrhdl_new, tavor_mr_options_t *op)
 769 {
 770         tavor_bind_info_t       bind;
 771         int                     status;
 772 
 773         TAVOR_TNF_ENTER(tavor_mr_reregister_buf);
 774 
 775         /*
 776          * Fill in the "bind" struct.  This struct provides the majority
 777          * of the information that will be used to distinguish between an
 778          * "addr" binding (see above) and a "buf" binding (as is the case
 779          * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
 780          * which does most of the "heavy lifting" for the Tavor memory
 781          * registration routines.  Note: We have chosen to provide
 782          * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
 783          * not set).  It is not critical what value we choose here as it need
 784          * only be unique for the given RKey (which will happen by default),
 785          * so the choice here is somewhat arbitrary.
 786          */
 787         bind.bi_type  = TAVOR_BINDHDL_BUF;
 788         bind.bi_buf   = buf;
 789         if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
 790                 bind.bi_addr  = mr_attr->mr_vaddr;
 791         } else {
 792                 bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
 793         }
 794         bind.bi_len   = (uint64_t)buf->b_bcount;
 795         bind.bi_flags = mr_attr->mr_flags;
 796         bind.bi_as = NULL;
 797         status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
 798         if (status != DDI_SUCCESS) {
 799                 TNF_PROBE_0(tavor_mr_reregister_buf_cmnreg_fail,
 800                     TAVOR_TNF_ERROR, "");
 801                 TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
 802                 return (status);
 803         }
 804 
 805         TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
 806         return (DDI_SUCCESS);
 807 }
 808 
 809 
 810 /*
 811  * tavor_mr_sync()
 812  *    Context: Can be called from interrupt or base context.
 813  */
/* ARGSUSED */
int
tavor_mr_sync(tavor_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs)
{
	tavor_mrhdl_t		mrhdl;
	uint64_t		seg_vaddr, seg_len, seg_end;
	uint64_t		mr_start, mr_end;
	uint_t			type;
	int			status, i;
	char			*errormsg;

	/*
	 * Note: the locals "status" and "errormsg" are assigned by the
	 * TAVOR_TNF_FAIL() macro invocations below; the failure path at
	 * "mrsync_fail" reads both.
	 */
	TAVOR_TNF_ENTER(tavor_mr_sync);

	/* Process each of the ibt_mr_sync_t's */
	for (i = 0; i < num_segs; i++) {
		mrhdl = (tavor_mrhdl_t)mr_segs[i].ms_handle;

		/* Check for valid memory region handle */
		if (mrhdl == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
			goto mrsync_fail;
		}

		mutex_enter(&mrhdl->mr_lock);

		/*
		 * Check here to see if the memory region has already been
		 * partially deregistered as a result of a
		 * tavor_umap_umemlock_cb() callback.  If so, this is an
		 * error, return failure.
		 */
		if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl2");
			goto mrsync_fail;
		}

		/*
		 * Check for valid bounds on sync request: both the first
		 * and last byte of the requested segment must fall within
		 * the [mr_start, mr_end] range of the bound region.
		 *
		 * NOTE(review): a ms_len of zero would make seg_end wrap
		 * below seg_vaddr — presumably disallowed by the IBTF
		 * caller; verify.
		 */
		seg_vaddr = mr_segs[i].ms_vaddr;
		seg_len   = mr_segs[i].ms_len;
		seg_end   = seg_vaddr + seg_len - 1;
		mr_start  = mrhdl->mr_bindinfo.bi_addr;
		mr_end    = mr_start + mrhdl->mr_bindinfo.bi_len - 1;
		if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_VA_INVALID, "invalid vaddr");
			goto mrsync_fail;
		}
		if ((seg_end < mr_start) || (seg_end > mr_end)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
			goto mrsync_fail;
		}

		/* Determine what type (i.e. direction) for sync */
		if (mr_segs[i].ms_flags & IBT_SYNC_READ) {
			type = DDI_DMA_SYNC_FORDEV;
		} else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) {
			type = DDI_DMA_SYNC_FORCPU;
		} else {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sync type");
			goto mrsync_fail;
		}

		/*
		 * Sync the DMA handle; the offset is relative to the start
		 * of the bound region.  The return value is deliberately
		 * discarded.
		 */
		(void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl,
		    (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type);
		mutex_exit(&mrhdl->mr_lock);
	}

	TAVOR_TNF_EXIT(tavor_mr_sync);
	return (DDI_SUCCESS);

mrsync_fail:
	TNF_PROBE_1(tavor_mr_sync_fail, TAVOR_TNF_ERROR, "", tnf_string, msg,
	    errormsg);
	TAVOR_TNF_EXIT(tavor_mr_sync);
	return (status);
}
 898 
 899 
 900 /*
 901  * tavor_mw_alloc()
 902  *    Context: Can be called from interrupt or base context.
 903  */
int
tavor_mw_alloc(tavor_state_t *state, tavor_pdhdl_t pd, ibt_mw_flags_t flags,
    tavor_mwhdl_t *mwhdl)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mwhdl_t		mw;
	uint_t			sleep;
	int			status;
	char			*errormsg;

	/*
	 * Note: the locals "status" and "errormsg" are assigned by the
	 * TAVOR_TNF_FAIL() macro invocations below; the failure path at
	 * "mwalloc_fail" reads both.
	 */
	TAVOR_TNF_ENTER(tavor_mw_alloc);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MW_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mwalloc_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry (for use as a memory window).  Since the
	 * Tavor hardware uses the MPT entry for memory regions and for
	 * memory windows, we will fill in this MPT with all the necessary
	 * parameters for the memory window.  And then (just as we do for
	 * memory regions) ownership will be passed to the hardware in the
	 * final step below.  If we fail here, we must undo the protection
	 * domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto mwalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the memory window (i.e.
	 * the Tavor Memory Window handle).  Note: This is actually the same
	 * software structure used for tracking memory regions, but since many
	 * of the same properties are needed, only a single structure is
	 * necessary.  If we fail here, we must undo the protection domain
	 * reference count and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto mwalloc_fail2;
	}
	mw = (tavor_mwhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Calculate an "unbound" RKey from MPT index.  In much the same way
	 * as we do for memory regions (above), this key is constructed from
	 * a "constrained" (which depends on the MPT index) and an
	 * "unconstrained" portion (which may be arbitrarily chosen).
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mw->mr_rkey);

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.  Note: fewer entries in the MPT
	 * entry are necessary to allocate a memory window.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win	= TAVOR_MPT_IS_WINDOW;
	mpt_entry.mem_key	= mw->mr_rkey;
	mpt_entry.pd		= pd->pd_pdnum;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mw_alloc_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto mwalloc_fail3;
	}

	/*
	 * Fill in the rest of the Tavor Memory Window handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MW.
	 */
	mw->mr_mptrsrcp	     = mpt;
	mw->mr_pdhdl = pd;
	mw->mr_rsrcp = rsrc;
	*mwhdl = mw;

	TAVOR_TNF_EXIT(tavor_mw_alloc);
	return (DDI_SUCCESS);

/*
 * Failure labels: each undoes the allocations made before the point of
 * failure, in reverse order of acquisition, then falls through to the
 * common trace-and-return path.
 */
mwalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
mwalloc_fail2:
	tavor_rsrc_free(state, &mpt);
mwalloc_fail1:
	tavor_pd_refcnt_dec(pd);
mwalloc_fail:
	TNF_PROBE_1(tavor_mw_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mw_alloc);
	return (status);
}
1030 
1031 
1032 /*
1033  * tavor_mw_free()
1034  *    Context: Can be called from interrupt or base context.
1035  */
int
tavor_mw_free(tavor_state_t *state, tavor_mwhdl_t *mwhdl, uint_t sleep)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_mwhdl_t		mw;
	int			status;
	char			*errormsg;
	tavor_pdhdl_t		pd;

	TAVOR_TNF_ENTER(tavor_mw_free);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 *
	 * Note: "status" and "errormsg" are assigned by the TAVOR_TNF_FAIL()
	 * macro below.
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
		TNF_PROBE_1(tavor_mw_free_fail, TAVOR_TNF_ERROR, "",
		    tnf_string, msg, errormsg);
		TAVOR_TNF_EXIT(tavor_mw_free);
		return (status);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Window
	 * handle.  This is necessary here because the resource for the
	 * MW handle is going to be freed up as part of the this operation.
	 */
	mw	= *mwhdl;
	mutex_enter(&mw->mr_lock);
	mpt	= mw->mr_mptrsrcp;
	rsrc	= mw->mr_rsrcp;
	pd	= mw->mr_pdhdl;
	mutex_exit(&mw->mr_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Reclaim the MPT entry from hardware.  Note: in general, it is
	 * unexpected for this operation to return an error.  On failure,
	 * *mwhdl is left intact (nothing has been freed yet).
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
	    0, mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail, TAVOR_TNF_ERROR, "",
		    tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_mw_free);
		return (IBT_INVALID_PARAM);
	}

	/* Free the Tavor Memory Window handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mwhdl pointer to NULL and return success */
	*mwhdl = NULL;

	TAVOR_TNF_EXIT(tavor_mw_free);
	return (DDI_SUCCESS);
}
1105 
1106 
1107 /*
1108  * tavor_mr_keycalc()
1109  *    Context: Can be called from interrupt or base context.
1110  */
1111 void
1112 tavor_mr_keycalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
1113 {
1114         uint32_t        tmp, log_num_mpt;
1115 
1116         /*
1117          * Generate a simple key from counter.  Note:  We increment this
1118          * static variable _intentionally_ without any kind of mutex around
1119          * it.  First, single-threading all operations through a single lock
1120          * would be a bad idea (from a performance point-of-view).  Second,
1121          * the upper "unconstrained" bits don't really have to be unique
1122          * because the lower bits are guaranteed to be (although we do make a
1123          * best effort to ensure that they are).  Third, the window for the
1124          * race (where both threads read and update the counter at the same
1125          * time) is incredibly small.
1126          * And, lastly, we'd like to make this into a "random" key XXX
1127          */
1128         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_memkey_cnt))
1129         log_num_mpt = state->ts_cfg_profile->cp_log_num_mpt;
1130         tmp = (tavor_debug_memkey_cnt++) << log_num_mpt;
1131         *key = tmp | indx;
1132 }
1133 
1134 
1135 /*
1136  * tavor_mr_common_reg()
1137  *    Context: Can be called from interrupt or base context.
1138  */
1139 static int
1140 tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
1141     tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
1142 {
1143         tavor_rsrc_pool_info_t  *rsrc_pool;
1144         tavor_rsrc_t            *mpt, *mtt, *rsrc, *mtt_refcnt;
1145         tavor_umap_db_entry_t   *umapdb;
1146         tavor_sw_refcnt_t       *swrc_tmp;
1147         tavor_hw_mpt_t          mpt_entry;
1148         tavor_mrhdl_t           mr;
1149         ibt_mr_flags_t          flags;
1150         tavor_bind_info_t       *bh;
1151         ddi_dma_handle_t        bind_dmahdl;
1152         ddi_umem_cookie_t       umem_cookie;
1153         size_t                  umem_len;
1154         caddr_t                 umem_addr;
1155         uint64_t                mtt_addr, mtt_ddrbaseaddr, max_sz;
1156         uint_t                  sleep, mtt_pgsize_bits, bind_type, mr_is_umem;
1157         int                     status, umem_flags, bind_override_addr;
1158         char                    *errormsg;
1159 
1160         TAVOR_TNF_ENTER(tavor_mr_common_reg);
1161 
1162         /*
1163          * Check the "options" flag.  Currently this flag tells the driver
1164          * whether or not the region should be bound normally (i.e. with
1165          * entries written into the PCI IOMMU), whether it should be
1166          * registered to bypass the IOMMU, and whether or not the resulting
1167          * address should be "zero-based" (to aid the alignment restrictions
1168          * for QPs).
1169          */
1170         if (op == NULL) {
1171                 bind_type   = TAVOR_BINDMEM_NORMAL;
1172                 bind_dmahdl = NULL;
1173                 bind_override_addr = 0;
1174         } else {
1175                 bind_type          = op->mro_bind_type;
1176                 bind_dmahdl        = op->mro_bind_dmahdl;
1177                 bind_override_addr = op->mro_bind_override_addr;
1178         }
1179 
1180         /* Extract the flags field from the tavor_bind_info_t */
1181         flags = bind->bi_flags;
1182 
1183         /*
1184          * Check for invalid length.  Check is the length is zero or if the
1185          * length is larger than the maximum configured value.  Return error
1186          * if it is.
1187          */
1188         max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
1189         if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
1190                 /* Set "status" and "errormsg" and goto failure */
1191                 TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
1192                 goto mrcommon_fail;
1193         }
1194 
1195         /*
1196          * Check the sleep flag.  Ensure that it is consistent with the
1197          * current thread context (i.e. if we are currently in the interrupt
1198          * context, then we shouldn't be attempting to sleep).
1199          */
1200         sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1201         if ((sleep == TAVOR_SLEEP) &&
1202             (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1203                 /* Set "status" and "errormsg" and goto failure */
1204                 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1205                 goto mrcommon_fail;
1206         }
1207 
1208         /*
1209          * Get the base address for the MTT table.  This will be necessary
1210          * below when we are setting up the MPT entry.
1211          */
1212         rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
1213         mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
1214 
1215         /* Increment the reference count on the protection domain (PD) */
1216         tavor_pd_refcnt_inc(pd);
1217 
1218         /*
1219          * Allocate an MPT entry.  This will be filled in with all the
1220          * necessary parameters to define the memory region.  And then
1221          * ownership will be passed to the hardware in the final step
1222          * below.  If we fail here, we must undo the protection domain
1223          * reference count.
1224          */
1225         status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
1226         if (status != DDI_SUCCESS) {
1227                 /* Set "status" and "errormsg" and goto failure */
1228                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
1229                 goto mrcommon_fail1;
1230         }
1231 
1232         /*
1233          * Allocate the software structure for tracking the memory region (i.e.
1234          * the Tavor Memory Region handle).  If we fail here, we must undo
1235          * the protection domain reference count and the previous resource
1236          * allocation.
1237          */
1238         status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
1239         if (status != DDI_SUCCESS) {
1240                 /* Set "status" and "errormsg" and goto failure */
1241                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
1242                 goto mrcommon_fail2;
1243         }
1244         mr = (tavor_mrhdl_t)rsrc->tr_addr;
1245         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
1246 
1247         /*
1248          * Setup and validate the memory region access flags.  This means
1249          * translating the IBTF's enable flags into the access flags that
1250          * will be used in later operations.
1251          */
1252         mr->mr_accflag = 0;
1253         if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1254                 mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1255         if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1256                 mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1257         if (flags & IBT_MR_ENABLE_REMOTE_READ)
1258                 mr->mr_accflag |= IBT_MR_REMOTE_READ;
1259         if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1260                 mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1261         if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1262                 mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1263 
1264         /*
1265          * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1266          * from a certain number of "constrained" bits (the least significant
1267          * bits) and some number of "unconstrained" bits.  The constrained
1268          * bits must be set to the index of the entry in the MPT table, but
1269          * the unconstrained bits can be set to any value we wish.  Note:
1270          * if no remote access is required, then the RKey value is not filled
1271          * in.  Otherwise both Rkey and LKey are given the same value.
1272          */
1273         tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1274         if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1275             (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1276             (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1277                 mr->mr_rkey = mr->mr_lkey;
1278         }
1279 
1280         /*
1281          * Determine if the memory is from userland and pin the pages
1282          * with umem_lockmemory() if necessary.
1283          * Then, if this is userland memory, allocate an entry in the
1284          * "userland resources database".  This will later be added to
1285          * the database (after all further memory registration operations are
1286          * successful).  If we fail here, we must undo the reference counts
1287          * and the previous resource allocations.
1288          */
1289         mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 1 : 0);
1290         if (mr_is_umem) {
1291                 umem_len   = ptob(btopr(bind->bi_len +
1292                     ((uintptr_t)bind->bi_addr & PAGEOFFSET)));
1293                 umem_addr  = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET);
1294                 umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
1295                     DDI_UMEMLOCK_LONGTERM);
1296                 status = umem_lockmemory(umem_addr, umem_len, umem_flags,
1297                     &umem_cookie, &tavor_umem_cbops, NULL);
1298                 if (status != 0) {
1299                         /* Set "status" and "errormsg" and goto failure */
1300                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
1301                         goto mrcommon_fail3;
1302                 }
1303 
1304                 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1305                 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1306 
1307                 bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len,
1308                     B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
1309                 if (bind->bi_buf == NULL) {
1310                         /* Set "status" and "errormsg" and goto failure */
1311                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed iosetup");
1312                         goto mrcommon_fail3;
1313                 }
1314                 bind->bi_type = TAVOR_BINDHDL_UBUF;
1315                 bind->bi_buf->b_flags |= B_READ;
1316 
1317                 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1318                 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1319 
1320                 umapdb = tavor_umap_db_alloc(state->ts_instance,
1321                     (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
1322                     (uint64_t)(uintptr_t)rsrc);
1323                 if (umapdb == NULL) {
1324                         /* Set "status" and "errormsg" and goto failure */
1325                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
1326                         goto mrcommon_fail4;
1327                 }
1328         }
1329 
1330         /*
1331          * Setup the bindinfo for the mtt bind call
1332          */
1333         bh = &mr->mr_bindinfo;
1334         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bh))
1335         bcopy(bind, bh, sizeof (tavor_bind_info_t));
1336         bh->bi_bypass = bind_type;
1337         status = tavor_mr_mtt_bind(state, bh, bind_dmahdl, &mtt,
1338             &mtt_pgsize_bits);
1339         if (status != DDI_SUCCESS) {
1340                 /* Set "status" and "errormsg" and goto failure */
1341                 TAVOR_TNF_FAIL(status, "failed mtt bind");
1342                 /*
1343                  * When mtt_bind fails, freerbuf has already been done,
1344                  * so make sure not to call it again.
1345                  */
1346                 bind->bi_type = bh->bi_type;
1347                 goto mrcommon_fail5;
1348         }
1349         mr->mr_logmttpgsz = mtt_pgsize_bits;
1350 
1351         /*
1352          * Allocate MTT reference count (to track shared memory regions).
1353          * This reference count resource may never be used on the given
1354          * memory region, but if it is ever later registered as "shared"
1355          * memory region then this resource will be necessary.  If we fail
1356          * here, we do pretty much the same as above to clean up.
1357          */
1358         status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1, sleep,
1359             &mtt_refcnt);
1360         if (status != DDI_SUCCESS) {
1361                 /* Set "status" and "errormsg" and goto failure */
1362                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed refence count");
1363                 goto mrcommon_fail6;
1364         }
1365         mr->mr_mttrefcntp = mtt_refcnt;
1366         swrc_tmp = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
1367         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_tmp))
1368         TAVOR_MTT_REFCNT_INIT(swrc_tmp);
1369 
1370         /*
1371          * Fill in the MPT entry.  This is the final step before passing
1372          * ownership of the MPT entry to the Tavor hardware.  We use all of
1373          * the information collected/calculated above to fill in the
1374          * requisite portions of the MPT.
1375          */
1376         bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
1377         mpt_entry.m_io    = TAVOR_MEM_CYCLE_GENERATE;
1378         mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1379         mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1380         mpt_entry.rw      = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1381         mpt_entry.rr      = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1382         mpt_entry.lw      = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1383         mpt_entry.lr      = 1;
1384         mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
1385         mpt_entry.page_sz       = mr->mr_logmttpgsz - 0xC;
1386         mpt_entry.mem_key       = mr->mr_lkey;
1387         mpt_entry.pd            = pd->pd_pdnum;
1388         if (bind_override_addr == 0) {
1389                 mpt_entry.start_addr = bh->bi_addr;
1390         } else {
1391                 bh->bi_addr = bh->bi_addr & ((1 << mr->mr_logmttpgsz) - 1);
1392                 mpt_entry.start_addr = bh->bi_addr;
1393         }
1394         mpt_entry.reg_win_len   = bh->bi_len;
1395         mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND;
1396         mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
1397         mpt_entry.mttseg_addr_h = mtt_addr >> 32;
1398         mpt_entry.mttseg_addr_l = mtt_addr >> 6;
1399 
1400         /*
1401          * Write the MPT entry to hardware.  Lastly, we pass ownership of
1402          * the entry to the hardware.  Note: in general, this operation
1403          * shouldn't fail.  But if it does, we have to undo everything we've
1404          * done above before returning error.
1405          */
1406         status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1407             sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
1408         if (status != TAVOR_CMD_SUCCESS) {
1409                 cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1410                     status);
1411                 TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
1412                     TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1413                 /* Set "status" and "errormsg" and goto failure */
1414                 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1415                     "tavor SW2HW_MPT command");
1416                 goto mrcommon_fail7;
1417         }
1418 
1419         /*
1420          * Fill in the rest of the Tavor Memory Region handle.  Having
1421          * successfully transferred ownership of the MPT, we can update the
1422          * following fields for use in further operations on the MR.
1423          */
1424         mr->mr_mptrsrcp        = mpt;
1425         mr->mr_mttrsrcp        = mtt;
1426         mr->mr_pdhdl   = pd;
1427         mr->mr_rsrcp   = rsrc;
1428         mr->mr_is_umem         = mr_is_umem;
1429         mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
1430         mr->mr_umem_cbfunc = NULL;
1431         mr->mr_umem_cbarg1 = NULL;
1432         mr->mr_umem_cbarg2 = NULL;
1433 
1434         /*
1435          * If this is userland memory, then we need to insert the previously
1436          * allocated entry into the "userland resources database".  This will
1437          * allow for later coordination between the tavor_umap_umemlock_cb()
1438          * callback and tavor_mr_deregister().
1439          */
1440         if (mr_is_umem) {
1441                 tavor_umap_db_add(umapdb);
1442         }
1443 
1444         *mrhdl = mr;
1445 
1446         TAVOR_TNF_EXIT(tavor_mr_common_reg);
1447         return (DDI_SUCCESS);
1448 
1449 /*
1450  * The following is cleanup for all possible failure cases in this routine
1451  */
1452 mrcommon_fail7:
1453         tavor_rsrc_free(state, &mtt_refcnt);
1454 mrcommon_fail6:
1455         tavor_rsrc_free(state, &mtt);
1456         tavor_mr_mem_unbind(state, bh);
1457         bind->bi_type = bh->bi_type;
1458 mrcommon_fail5:
1459         if (mr_is_umem) {
1460                 tavor_umap_db_free(umapdb);
1461         }
1462 mrcommon_fail4:
1463         if (mr_is_umem) {
1464                 /*
1465                  * Free up the memory ddi_umem_iosetup() allocates
1466                  * internally.
1467                  */
1468                 if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
1469                         freerbuf(bind->bi_buf);
1470                         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1471                         bind->bi_type = TAVOR_BINDHDL_NONE;
1472                         _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1473                 }
1474                 ddi_umem_unlock(umem_cookie);
1475         }
1476 mrcommon_fail3:
1477         tavor_rsrc_free(state, &rsrc);
1478 mrcommon_fail2:
1479         tavor_rsrc_free(state, &mpt);
1480 mrcommon_fail1:
1481         tavor_pd_refcnt_dec(pd);
1482 mrcommon_fail:
1483         TNF_PROBE_1(tavor_mr_common_reg_fail, TAVOR_TNF_ERROR, "",
1484             tnf_string, msg, errormsg);
1485         TAVOR_TNF_EXIT(tavor_mr_common_reg);
1486         return (status);
1487 }
1488 
1489 int
1490 tavor_dma_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
1491     ibt_dmr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl)
1492 {
1493         tavor_rsrc_t            *mpt, *rsrc;
1494         tavor_hw_mpt_t          mpt_entry;
1495         tavor_mrhdl_t           mr;
1496         ibt_mr_flags_t          flags;
1497         uint_t                  sleep;
1498         int                     status;
1499 
1500         /* Extract the flags field */
1501         flags = mr_attr->dmr_flags;
1502 
1503         /*
1504          * Check the sleep flag.  Ensure that it is consistent with the
1505          * current thread context (i.e. if we are currently in the interrupt
1506          * context, then we shouldn't be attempting to sleep).
1507          */
1508         sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1509         if ((sleep == TAVOR_SLEEP) &&
1510             (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1511                 status = IBT_INVALID_PARAM;
1512                 goto mrcommon_fail;
1513         }
1514 
1515         /* Increment the reference count on the protection domain (PD) */
1516         tavor_pd_refcnt_inc(pd);
1517 
1518         /*
1519          * Allocate an MPT entry.  This will be filled in with all the
1520          * necessary parameters to define the memory region.  And then
1521          * ownership will be passed to the hardware in the final step
1522          * below.  If we fail here, we must undo the protection domain
1523          * reference count.
1524          */
1525         status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
1526         if (status != DDI_SUCCESS) {
1527                 status = IBT_INSUFF_RESOURCE;
1528                 goto mrcommon_fail1;
1529         }
1530 
1531         /*
1532          * Allocate the software structure for tracking the memory region (i.e.
1533          * the Tavor Memory Region handle).  If we fail here, we must undo
1534          * the protection domain reference count and the previous resource
1535          * allocation.
1536          */
1537         status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
1538         if (status != DDI_SUCCESS) {
1539                 status = IBT_INSUFF_RESOURCE;
1540                 goto mrcommon_fail2;
1541         }
1542         mr = (tavor_mrhdl_t)rsrc->tr_addr;
1543         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
1544         bzero(mr, sizeof (*mr));
1545 
1546         /*
1547          * Setup and validate the memory region access flags.  This means
1548          * translating the IBTF's enable flags into the access flags that
1549          * will be used in later operations.
1550          */
1551         mr->mr_accflag = 0;
1552         if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1553                 mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1554         if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1555                 mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1556         if (flags & IBT_MR_ENABLE_REMOTE_READ)
1557                 mr->mr_accflag |= IBT_MR_REMOTE_READ;
1558         if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1559                 mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1560         if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1561                 mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1562 
1563         /*
1564          * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1565          * from a certain number of "constrained" bits (the least significant
1566          * bits) and some number of "unconstrained" bits.  The constrained
1567          * bits must be set to the index of the entry in the MPT table, but
1568          * the unconstrained bits can be set to any value we wish.  Note:
1569          * if no remote access is required, then the RKey value is not filled
1570          * in.  Otherwise both Rkey and LKey are given the same value.
1571          */
1572         tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1573         if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1574             (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1575             (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1576                 mr->mr_rkey = mr->mr_lkey;
1577         }
1578 
1579         /*
1580          * Fill in the MPT entry.  This is the final step before passing
1581          * ownership of the MPT entry to the Tavor hardware.  We use all of
1582          * the information collected/calculated above to fill in the
1583          * requisite portions of the MPT.
1584          */
1585         bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
1586 
1587         mpt_entry.m_io    = TAVOR_MEM_CYCLE_GENERATE;
1588         mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1589         mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1590         mpt_entry.rw      = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1591         mpt_entry.rr      = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1592         mpt_entry.lw      = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1593         mpt_entry.lr      = 1;
1594         mpt_entry.phys_addr = 1;        /* critical bit for this */
1595         mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
1596 
1597         mpt_entry.page_sz       = mr->mr_logmttpgsz - 0xC;
1598         mpt_entry.mem_key       = mr->mr_lkey;
1599         mpt_entry.pd            = pd->pd_pdnum;
1600         mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND;
1601 
1602         mpt_entry.start_addr = mr_attr->dmr_paddr;
1603         mpt_entry.reg_win_len = mr_attr->dmr_len;
1604 
1605         mpt_entry.mttseg_addr_h = 0;
1606         mpt_entry.mttseg_addr_l = 0;
1607 
1608         /*
1609          * Write the MPT entry to hardware.  Lastly, we pass ownership of
1610          * the entry to the hardware if needed.  Note: in general, this
1611          * operation shouldn't fail.  But if it does, we have to undo
1612          * everything we've done above before returning error.
1613          *
1614          * For Tavor, this routine (which is common to the contexts) will only
1615          * set the ownership if needed - the process of passing the context
1616          * itself to HW will take care of setting up the MPT (based on type
1617          * and index).
1618          */
1619 
1620         status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1621             sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
1622         if (status != TAVOR_CMD_SUCCESS) {
1623                 cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1624                     status);
1625                 status = ibc_get_ci_failure(0);
1626                 goto mrcommon_fail7;
1627         }
1628 
1629         /*
1630          * Fill in the rest of the Tavor Memory Region handle.  Having
1631          * successfully transferred ownership of the MPT, we can update the
1632          * following fields for use in further operations on the MR.
1633          */
1634         mr->mr_mptrsrcp         = mpt;
1635         mr->mr_mttrsrcp         = NULL;
1636         mr->mr_pdhdl    = pd;
1637         mr->mr_rsrcp    = rsrc;
1638         mr->mr_is_umem          = 0;
1639         mr->mr_umemcookie  = NULL;
1640         mr->mr_umem_cbfunc = NULL;
1641         mr->mr_umem_cbarg1 = NULL;
1642         mr->mr_umem_cbarg2 = NULL;
1643 
1644         *mrhdl = mr;
1645 
1646         return (DDI_SUCCESS);
1647 
1648 /*
1649  * The following is cleanup for all possible failure cases in this routine
1650  */
1651 mrcommon_fail7:
1652         tavor_rsrc_free(state, &rsrc);
1653 mrcommon_fail2:
1654         tavor_rsrc_free(state, &mpt);
1655 mrcommon_fail1:
1656         tavor_pd_refcnt_dec(pd);
1657 mrcommon_fail:
1658         return (status);
1659 }
1660 
1661 /*
1662  * tavor_mr_mtt_bind()
1663  *    Context: Can be called from interrupt or base context.
1664  */
1665 int
1666 tavor_mr_mtt_bind(tavor_state_t *state, tavor_bind_info_t *bind,
1667     ddi_dma_handle_t bind_dmahdl, tavor_rsrc_t **mtt, uint_t *mtt_pgsize_bits)
1668 {
1669         uint64_t                nummtt;
1670         uint_t                  sleep;
1671         int                     status;
1672         char                    *errormsg;
1673 
1674         TAVOR_TNF_ENTER(tavor_mr_common_reg);
1675 
1676         /*
1677          * Check the sleep flag.  Ensure that it is consistent with the
1678          * current thread context (i.e. if we are currently in the interrupt
1679          * context, then we shouldn't be attempting to sleep).
1680          */
1681         sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1682         if ((sleep == TAVOR_SLEEP) &&
1683             (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1684                 /* Set "status" and "errormsg" and goto failure */
1685                 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1686                 goto mrmttbind_fail;
1687         }
1688 
1689         /*
1690          * Bind the memory and determine the mapped addresses.  This is
1691          * the first of two routines that do all the "heavy lifting" for
1692          * the Tavor memory registration routines.  The tavor_mr_mem_bind()
1693          * routine takes the "bind" struct with all its fields filled
1694          * in and returns a list of DMA cookies (for the PCI mapped addresses
1695          * corresponding to the specified address region) which are used by
1696          * the tavor_mr_fast_mtt_write() routine below.  If we fail here, we
1697          * must undo all the previous resource allocation (and PD reference
1698          * count).
1699          */
1700         status = tavor_mr_mem_bind(state, bind, bind_dmahdl, sleep);
1701         if (status != DDI_SUCCESS) {
1702                 /* Set "status" and "errormsg" and goto failure */
1703                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
1704                 goto mrmttbind_fail;
1705         }
1706 
1707         /*
1708          * Determine number of pages spanned.  This routine uses the
1709          * information in the "bind" struct to determine the required
1710          * number of MTT entries needed (and returns the suggested page size -
1711          * as a "power-of-2" - for each MTT entry).
1712          */
1713         nummtt = tavor_mr_nummtt_needed(state, bind, mtt_pgsize_bits);
1714 
1715         /*
1716          * Allocate the MTT entries.  Use the calculations performed above to
1717          * allocate the required number of MTT entries.  Note: MTT entries are
1718          * allocated in "MTT segments" which consist of complete cachelines
1719          * (i.e. 8 entries, 16 entries, etc.)  So the TAVOR_NUMMTT_TO_MTTSEG()
1720          * macro is used to do the proper conversion.  If we fail here, we
1721          * must not only undo all the previous resource allocation (and PD
1722          * reference count), but we must also unbind the memory.
1723          */
1724         status = tavor_rsrc_alloc(state, TAVOR_MTT,
1725             TAVOR_NUMMTT_TO_MTTSEG(nummtt), sleep, mtt);
1726         if (status != DDI_SUCCESS) {
1727                 /* Set "status" and "errormsg" and goto failure */
1728                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
1729                 goto mrmttbind_fail2;
1730         }
1731 
1732         /*
1733          * Write the mapped addresses into the MTT entries.  This is part two
1734          * of the "heavy lifting" routines that we talked about above.  Note:
1735          * we pass the suggested page size from the earlier operation here.
1736          * And if we fail here, we again do pretty much the same huge clean up.
1737          */
1738         status = tavor_mr_fast_mtt_write(*mtt, bind, *mtt_pgsize_bits);
1739         if (status != DDI_SUCCESS) {
1740                 /* Set "status" and "errormsg" and goto failure */
1741                 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed write mtt");
1742                 goto mrmttbind_fail3;
1743         }
1744         TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
1745         return (DDI_SUCCESS);
1746 
1747 /*
1748  * The following is cleanup for all possible failure cases in this routine
1749  */
1750 mrmttbind_fail3:
1751         tavor_rsrc_free(state, mtt);
1752 mrmttbind_fail2:
1753         tavor_mr_mem_unbind(state, bind);
1754 mrmttbind_fail:
1755         TNF_PROBE_1(tavor_mr_mtt_bind_fail, TAVOR_TNF_ERROR, "",
1756             tnf_string, msg, errormsg);
1757         TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
1758         return (status);
1759 }
1760 
1761 
1762 /*
1763  * tavor_mr_mtt_unbind()
1764  *    Context: Can be called from interrupt or base context.
1765  */
1766 int
1767 tavor_mr_mtt_unbind(tavor_state_t *state, tavor_bind_info_t *bind,
1768     tavor_rsrc_t *mtt)
1769 {
1770         TAVOR_TNF_ENTER(tavor_mr_mtt_unbind);
1771 
1772         /*
1773          * Free up the MTT entries and unbind the memory.  Here, as above, we
1774          * attempt to free these resources only if it is appropriate to do so.
1775          */
1776         tavor_mr_mem_unbind(state, bind);
1777         tavor_rsrc_free(state, &mtt);
1778 
1779         TAVOR_TNF_EXIT(tavor_mr_mtt_unbind);
1780         return (DDI_SUCCESS);
1781 }
1782 
1783 
1784 /*
1785  * tavor_mr_common_rereg()
1786  *    Context: Can be called from interrupt or base context.
1787  */
static int
tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
    tavor_mr_options_t *op)
{
        tavor_rsrc_t            *mpt;
        ibt_mr_attr_flags_t     acc_flags_to_use;
        ibt_mr_flags_t          flags;
        tavor_pdhdl_t           pd_to_use;
        tavor_hw_mpt_t          mpt_entry;
        uint64_t                mtt_addr_to_use, vaddr_to_use, len_to_use;
        uint_t                  sleep, dereg_level;
        int                     status;
        /* "errormsg" is set (with "status") by TAVOR_TNF_FAIL before each */
        /* "goto mrrereg_fail" below */
        char                    *errormsg;

        TAVOR_TNF_ENTER(tavor_mr_common_rereg);

        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))

        /*
         * Check here to see if the memory region corresponds to a userland
         * mapping.  Reregistration of userland memory regions is not
         * currently supported.  Return failure. XXX
         */
        if (mr->mr_is_umem) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
                goto mrrereg_fail;
        }

        /* Hold mr_lock across the HW2SW/modify/SW2HW sequence below */
        mutex_enter(&mr->mr_lock);

        /* Pull MPT resource pointer from the Tavor Memory Region handle */
        mpt = mr->mr_mptrsrcp;

        /* Extract the flags field from the tavor_bind_info_t */
        flags = bind->bi_flags;

        /*
         * Check the sleep flag.  Ensure that it is consistent with the
         * current thread context (i.e. if we are currently in the interrupt
         * context, then we shouldn't be attempting to sleep).
         */
        sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
        if ((sleep == TAVOR_SLEEP) &&
            (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
                mutex_exit(&mr->mr_lock);
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
                goto mrrereg_fail;
        }

        /*
         * First step is to temporarily invalidate the MPT entry.  This
         * regains ownership from the hardware, and gives us the opportunity
         * to modify the entry.  Note: The HW2SW_MPT command returns the
         * current MPT entry contents.  These are saved away here because
         * they will be reused in a later step below.  If the region has
         * bound memory windows that we fail returning an "in use" error code.
         * Otherwise, this is an unexpected error and we deregister the
         * memory region and return error.
         *
         * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
         * against holding the lock around this rereg call in all contexts.
         */
        status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
            sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
        if (status != TAVOR_CMD_SUCCESS) {
                mutex_exit(&mr->mr_lock);
                if (status == TAVOR_CMD_REG_BOUND) {
                        TAVOR_TNF_EXIT(tavor_mr_common_rereg);
                        return (IBT_MR_IN_USE);
                } else {
                        cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: "
                            "%08x\n", status);

                        /*
                         * Call deregister and ensure that all current
                         * resources get freed up
                         */
                        if (tavor_mr_deregister(state, &mr,
                            TAVOR_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
                                TAVOR_WARNING(state, "failed to deregister "
                                    "memory region");
                        }
                        TNF_PROBE_1(tavor_mr_common_rereg_hw2sw_mpt_cmd_fail,
                            TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                        TAVOR_TNF_EXIT(tavor_mr_common_rereg);
                        return (ibc_get_ci_failure(0));
                }
        }

        /*
         * If we're changing the protection domain, then validate the new one
         */
        if (flags & IBT_MR_CHANGE_PD) {

                /* Check for valid PD handle pointer */
                if (pd == NULL) {
                        mutex_exit(&mr->mr_lock);
                        /*
                         * Call deregister and ensure that all current
                         * resources get properly freed up. Unnecessary
                         * here to attempt to regain software ownership
                         * of the MPT entry as that has already been
                         * done above.
                         */
                        if (tavor_mr_deregister(state, &mr,
                            TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
                            DDI_SUCCESS) {
                                TAVOR_WARNING(state, "failed to deregister "
                                    "memory region");
                        }
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
                        goto mrrereg_fail;
                }

                /* Use the new PD handle in all operations below */
                pd_to_use = pd;

        } else {
                /* Use the current PD handle in all operations below */
                pd_to_use = mr->mr_pdhdl;
        }

        /*
         * If we're changing access permissions, then validate the new ones
         */
        if (flags & IBT_MR_CHANGE_ACCESS) {
                /*
                 * Validate the access flags.  Both remote write and remote
                 * atomic require the local write flag to be set
                 */
                if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
                    (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
                    !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
                        mutex_exit(&mr->mr_lock);
                        /*
                         * Call deregister and ensure that all current
                         * resources get properly freed up. Unnecessary
                         * here to attempt to regain software ownership
                         * of the MPT entry as that has already been
                         * done above.
                         */
                        if (tavor_mr_deregister(state, &mr,
                            TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
                            DDI_SUCCESS) {
                                TAVOR_WARNING(state, "failed to deregister "
                                    "memory region");
                        }
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_MR_ACCESS_REQ_INVALID,
                            "invalid access flags");
                        goto mrrereg_fail;
                }

                /*
                 * Setup and validate the memory region access flags.  This
                 * means translating the IBTF's enable flags into the access
                 * flags that will be used in later operations.
                 */
                acc_flags_to_use = 0;
                if (flags & IBT_MR_ENABLE_WINDOW_BIND)
                        acc_flags_to_use |= IBT_MR_WINDOW_BIND;
                if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
                        acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
                if (flags & IBT_MR_ENABLE_REMOTE_READ)
                        acc_flags_to_use |= IBT_MR_REMOTE_READ;
                if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
                        acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
                if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
                        acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;

        } else {
                /* Keep the region's current access flags */
                acc_flags_to_use = mr->mr_accflag;
        }

        /*
         * If we're modifying the translation, then figure out whether
         * we can reuse the current MTT resources.  This means calling
         * tavor_mr_rereg_xlat_helper() which does most of the heavy lifting
         * for the reregistration.  If the current memory region contains
         * sufficient MTT entries for the new regions, then it will be
         * reused and filled in.  Otherwise, new entries will be allocated,
         * the old ones will be freed, and the new entries will be filled
         * in.  Note:  If we're not modifying the translation, then we
         * should already have all the information we need to update the MPT.
         * Also note: If tavor_mr_rereg_xlat_helper() fails, it will return
         * a "dereg_level" which is the level of cleanup that needs to be
         * passed to tavor_mr_deregister() to finish the cleanup.
         */
        if (flags & IBT_MR_CHANGE_TRANSLATION) {
                status = tavor_mr_rereg_xlat_helper(state, mr, bind, op,
                    &mtt_addr_to_use, sleep, &dereg_level);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&mr->mr_lock);
                        /*
                         * Call deregister and ensure that all resources get
                         * properly freed up.
                         */
                        if (tavor_mr_deregister(state, &mr, dereg_level,
                            sleep) != DDI_SUCCESS) {
                                TAVOR_WARNING(state, "failed to deregister "
                                    "memory region");
                        }

                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(status, "failed rereg helper");
                        goto mrrereg_fail;
                }
                vaddr_to_use = mr->mr_bindinfo.bi_addr;
                len_to_use   = mr->mr_bindinfo.bi_len;
        } else {
                /*
                 * Not modifying the translation: reuse the MTT segment
                 * address saved from the HW2SW_MPT command results above.
                 */
                mtt_addr_to_use = (((uint64_t)mpt_entry.mttseg_addr_h << 32) |
                    ((uint64_t)mpt_entry.mttseg_addr_l << 6));
                vaddr_to_use = mr->mr_bindinfo.bi_addr;
                len_to_use   = mr->mr_bindinfo.bi_len;
        }

        /*
         * Calculate new keys (Lkey, Rkey) from MPT index.  Just like they were
         * when the region was first registered, each key is formed from
         * "constrained" bits and "unconstrained" bits.  Note:  If no remote
         * access is required, then the RKey value is not filled in.  Otherwise
         * both Rkey and LKey are given the same value.
         */
        tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
        if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
            (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
            (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
                mr->mr_rkey = mr->mr_lkey;
        }

        /*
         * Update the MPT entry with the new information.  Some of this
         * information is retained from the previous operation, some of
         * it is new based on request.
         */
        mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND)   ? 1 : 0;
        mpt_entry.atomic  = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
        mpt_entry.rw      = (acc_flags_to_use & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
        mpt_entry.rr      = (acc_flags_to_use & IBT_MR_REMOTE_READ)   ? 1 : 0;
        mpt_entry.lw      = (acc_flags_to_use & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
        /* "page_sz" is encoded as log2(page size) - 12 */
        mpt_entry.page_sz       = mr->mr_logmttpgsz - 0xC;
        mpt_entry.mem_key       = mr->mr_lkey;
        mpt_entry.pd            = pd_to_use->pd_pdnum;
        mpt_entry.start_addr    = vaddr_to_use;
        mpt_entry.reg_win_len   = len_to_use;
        mpt_entry.mttseg_addr_h = mtt_addr_to_use >> 32;
        mpt_entry.mttseg_addr_l = mtt_addr_to_use >> 6;

        /*
         * Write the updated MPT entry to hardware
         *
         * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
         * against holding the lock around this rereg call in all contexts.
         */
        status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
            sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
        if (status != TAVOR_CMD_SUCCESS) {
                mutex_exit(&mr->mr_lock);
                cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
                    status);
                /*
                 * Call deregister and ensure that all current resources get
                 * properly freed up. Unnecessary here to attempt to regain
                 * software ownership of the MPT entry as that has already
                 * been done above.
                 */
                if (tavor_mr_deregister(state, &mr,
                    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
                        TAVOR_WARNING(state, "failed to deregister memory "
                            "region");
                }
                TNF_PROBE_1(tavor_mr_common_rereg_sw2hw_mpt_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                TAVOR_TNF_EXIT(tavor_mr_common_rereg);
                return (ibc_get_ci_failure(0));
        }

        /*
         * If we're changing PD, then update their reference counts now.
         * This means decrementing the reference count on the old PD and
         * incrementing the reference count on the new PD.
         */
        if (flags & IBT_MR_CHANGE_PD) {
                tavor_pd_refcnt_dec(mr->mr_pdhdl);
                tavor_pd_refcnt_inc(pd);
        }

        /*
         * Update the contents of the Tavor Memory Region handle to reflect
         * what has been changed.
         */
        mr->mr_pdhdl   = pd_to_use;
        mr->mr_accflag         = acc_flags_to_use;
        mr->mr_is_umem         = 0;
        mr->mr_umemcookie = NULL;

        /* New MR handle is same as the old */
        *mrhdl_new = mr;
        mutex_exit(&mr->mr_lock);

        TAVOR_TNF_EXIT(tavor_mr_common_rereg);
        return (DDI_SUCCESS);

/*
 * Failure path: "status" and "errormsg" were set (via TAVOR_TNF_FAIL)
 * before each jump here; any needed deregistration was done at the
 * jump site.
 */
mrrereg_fail:
        TNF_PROBE_1(tavor_mr_common_rereg_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_mr_common_rereg);
        return (status);
}
2101 
2102 
2103 /*
2104  * tavor_mr_rereg_xlat_helper
2105  *    Context: Can be called from interrupt or base context.
2106  *    Note: This routine expects the "mr_lock" to be held when it
2107  *    is called.  Upon returning failure, this routine passes information
2108  *    about what "dereg_level" should be passed to tavor_mr_deregister().
2109  */
static int
tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
    uint_t sleep, uint_t *dereg_level)
{
        tavor_rsrc_pool_info_t  *rsrc_pool;
        tavor_rsrc_t            *mtt, *mtt_refcnt;
        tavor_sw_refcnt_t       *swrc_old, *swrc_new;
        ddi_dma_handle_t        dmahdl;
        uint64_t                nummtt_needed, nummtt_in_currrsrc, max_sz;
        uint64_t                mtt_ddrbaseaddr;
        uint_t                  mtt_pgsize_bits, bind_type, reuse_dmahdl;
        int                     status;
        char                    *errormsg;

        /*
         * Note on error handling: each failure path below uses the
         * TAVOR_TNF_FAIL() macro to set "status" and "errormsg", which are
         * consumed at the "mrrereghelp_fail" label.  Each failure path also
         * stores into "*dereg_level" the deregistration level that the
         * caller must pass to tavor_mr_deregister() (see function header).
         */
        TAVOR_TNF_ENTER(tavor_mr_rereg_xlat_helper);

        ASSERT(MUTEX_HELD(&mr->mr_lock));

        /*
         * Check the "options" flag.  Currently this flag tells the driver
         * whether or not the region should be bound normally (i.e. with
         * entries written into the PCI IOMMU) or whether it should be
         * registered to bypass the IOMMU.
         */
        if (op == NULL) {
                bind_type = TAVOR_BINDMEM_NORMAL;
        } else {
                bind_type = op->mro_bind_type;
        }

        /*
         * Check for invalid length.  Check is the length is zero or if the
         * length is larger than the maximum configured value.  Return error
         * if it is.
         */
        max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
        if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
                /*
                 * Deregister will be called upon returning failure from this
                 * routine. This will ensure that all current resources get
                 * properly freed up. Unnecessary to attempt to regain
                 * software ownership of the MPT entry as that has already
                 * been done above (in tavor_mr_reregister())
                 */
                *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT;

                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
                goto mrrereghelp_fail;
        }

        /*
         * Determine the number of pages necessary for new region and the
         * number of pages supported by the current MTT resources
         */
        nummtt_needed = tavor_mr_nummtt_needed(state, bind, &mtt_pgsize_bits);
        /* Current MTT resource length (bytes) converted to MTT entry count */
        nummtt_in_currrsrc = mr->mr_mttrsrcp->tr_len >> TAVOR_MTT_SIZE_SHIFT;

        /*
         * Depending on whether we have enough pages or not, the next step is
         * to fill in a set of MTT entries that reflect the new mapping.  In
         * the first case below, we already have enough entries.  This means
         * we need to unbind the memory from the previous mapping, bind the
         * memory for the new mapping, write the new MTT entries, and update
         * the mr to reflect the changes.
         * In the second case below, we do not have enough entries in the
         * current mapping.  So, in this case, we need not only to unbind the
         * current mapping, but we need to free up the MTT resources associated
         * with that mapping.  After we've successfully done that, we continue
         * by binding the new memory, allocating new MTT entries, writing the
         * new MTT entries, and updating the mr to reflect the changes.
         */

        /*
         * If this region is being shared (i.e. MTT refcount != 1), then we
         * can't reuse the current MTT resources regardless of their size.
         * Instead we'll need to alloc new ones (below) just as if there
         * hadn't been enough room in the current entries.
         */
        swrc_old = (tavor_sw_refcnt_t *)mr->mr_mttrefcntp->tr_addr;
        if (TAVOR_MTT_IS_NOT_SHARED(swrc_old) &&
            (nummtt_needed <= nummtt_in_currrsrc)) {

                /*
                 * Unbind the old mapping for this memory region, but retain
                 * the ddi_dma_handle_t (if possible) for reuse in the bind
                 * operation below.  Note:  If original memory region was
                 * bound for IOMMU bypass and the new region can not use
                 * bypass, then a new DMA handle will be necessary.
                 */
                if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
                        /*
                         * Clear bi_free_dmahdl first so the unbind below
                         * does not free the handle we intend to reuse.
                         */
                        mr->mr_bindinfo.bi_free_dmahdl = 0;
                        tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
                        dmahdl = mr->mr_bindinfo.bi_dmahdl;
                        reuse_dmahdl = 1;
                } else {
                        tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
                        dmahdl = NULL;
                        reuse_dmahdl = 0;
                }

                /*
                 * Bind the new memory and determine the mapped addresses.
                 * As described, this routine and tavor_mr_fast_mtt_write()
                 * do the majority of the work for the memory registration
                 * operations.  Note:  When we successfully finish the binding,
                 * we will set the "bi_free_dmahdl" flag to indicate that
                 * even though we may have reused the ddi_dma_handle_t we do
                 * wish it to be freed up at some later time.  Note also that
                 * if we fail, we may need to cleanup the ddi_dma_handle_t.
                 */
                bind->bi_bypass      = bind_type;
                status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
                if (status != DDI_SUCCESS) {
                        if (reuse_dmahdl) {
                                ddi_dma_free_handle(&dmahdl);
                        }

                        /*
                         * Deregister will be called upon returning failure
                         * from this routine. This will ensure that all
                         * current resources get properly freed up.
                         * Unnecessary to attempt to regain software ownership
                         * of the MPT entry as that has already been done
                         * above (in tavor_mr_reregister()).  Also unnecessary
                         * to attempt to unbind the memory.
                         */
                        *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
                        goto mrrereghelp_fail;
                }
                if (reuse_dmahdl) {
                        /*
                         * Handle was reused; flag it so a later unbind of
                         * this new mapping frees it.
                         */
                        bind->bi_free_dmahdl = 1;
                }

                /*
                 * Using the new mapping, but reusing the current MTT
                 * resources, write the updated entries to MTT
                 */
                mtt    = mr->mr_mttrsrcp;
                status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
                if (status != DDI_SUCCESS) {
                        /*
                         * Deregister will be called upon returning failure
                         * from this routine. This will ensure that all
                         * current resources get properly freed up.
                         * Unnecessary to attempt to regain software ownership
                         * of the MPT entry as that has already been done
                         * above (in tavor_mr_reregister()).  Also unnecessary
                         * to attempt to unbind the memory.
                         *
                         * But we do need to unbind the newly bound memory
                         * before returning.
                         */
                        tavor_mr_mem_unbind(state, bind);
                        *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
                            "failed write mtt");
                        goto mrrereghelp_fail;
                }

                /* Put the updated information into the Mem Region handle */
                mr->mr_bindinfo        = *bind;
                mr->mr_logmttpgsz = mtt_pgsize_bits;

        } else {
                /*
                 * Check if the memory region MTT is shared by any other MRs.
                 * Since the resource may be shared between multiple memory
                 * regions (as a result of a "RegisterSharedMR()" verb) it is
                 * important that we not unbind any resources prematurely.
                 */
                if (!TAVOR_MTT_IS_SHARED(swrc_old)) {
                        /*
                         * Unbind the old mapping for this memory region, but
                         * retain the ddi_dma_handle_t for reuse in the bind
                         * operation below. Note: This can only be done here
                         * because the region being reregistered is not
                         * currently shared.  Also if original memory region
                         * was bound for IOMMU bypass and the new region can
                         * not use bypass, then a new DMA handle will be
                         * necessary.
                         */
                        if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
                                mr->mr_bindinfo.bi_free_dmahdl = 0;
                                tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
                                dmahdl = mr->mr_bindinfo.bi_dmahdl;
                                reuse_dmahdl = 1;
                        } else {
                                tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
                                dmahdl = NULL;
                                reuse_dmahdl = 0;
                        }
                } else {
                        /* Shared: leave the old mapping bound for other MRs */
                        dmahdl = NULL;
                        reuse_dmahdl = 0;
                }

                /*
                 * Bind the new memory and determine the mapped addresses.
                 * As described, this routine and tavor_mr_fast_mtt_write()
                 * do the majority of the work for the memory registration
                 * operations.  Note:  When we successfully finish the binding,
                 * we will set the "bi_free_dmahdl" flag to indicate that
                 * even though we may have reused the ddi_dma_handle_t we do
                 * wish it to be freed up at some later time.  Note also that
                 * if we fail, we may need to cleanup the ddi_dma_handle_t.
                 */
                bind->bi_bypass      = bind_type;
                status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
                if (status != DDI_SUCCESS) {
                        if (reuse_dmahdl) {
                                ddi_dma_free_handle(&dmahdl);
                        }

                        /*
                         * Deregister will be called upon returning failure
                         * from this routine. This will ensure that all
                         * current resources get properly freed up.
                         * Unnecessary to attempt to regain software ownership
                         * of the MPT entry as that has already been done
                         * above (in tavor_mr_reregister()).  Also unnecessary
                         * to attempt to unbind the memory.
                         */
                        *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
                        goto mrrereghelp_fail;
                }
                if (reuse_dmahdl) {
                        /*
                         * Handle was reused; flag it so a later unbind of
                         * this new mapping frees it.
                         */
                        bind->bi_free_dmahdl = 1;
                }

                /*
                 * Allocate the new MTT entries resource
                 */
                status = tavor_rsrc_alloc(state, TAVOR_MTT,
                    TAVOR_NUMMTT_TO_MTTSEG(nummtt_needed), sleep, &mtt);
                if (status != DDI_SUCCESS) {
                        /*
                         * Deregister will be called upon returning failure
                         * from this routine. This will ensure that all
                         * current resources get properly freed up.
                         * Unnecessary to attempt to regain software ownership
                         * of the MPT entry as that has already been done
                         * above (in tavor_mr_reregister()).  Also unnecessary
                         * to attempt to unbind the memory.
                         *
                         * But we do need to unbind the newly bound memory
                         * before returning.
                         */
                        tavor_mr_mem_unbind(state, bind);
                        *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
                        goto mrrereghelp_fail;
                }

                /*
                 * Allocate MTT reference count (to track shared memory
                 * regions).  As mentioned elsewhere above, this reference
                 * count resource may never be used on the given memory region,
                 * but if it is ever later registered as a "shared" memory
                 * region then this resource will be necessary.  Note:  This
                 * is only necessary here if the existing memory region is
                 * already being shared (because otherwise we already have
                 * a useable reference count resource).
                 */
                if (TAVOR_MTT_IS_SHARED(swrc_old)) {
                        status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1,
                            sleep, &mtt_refcnt);
                        if (status != DDI_SUCCESS) {
                                /*
                                 * Deregister will be called upon returning
                                 * failure from this routine. This will ensure
                                 * that all current resources get properly
                                 * freed up.  Unnecessary to attempt to regain
                                 * software ownership of the MPT entry as that
                                 * has already been done above (in
                                 * tavor_mr_reregister()).  Also unnecessary
                                 * to attempt to unbind the memory.
                                 *
                                 * But we need to unbind the newly bound
                                 * memory and free up the newly allocated MTT
                                 * entries before returning.
                                 */
                                tavor_mr_mem_unbind(state, bind);
                                tavor_rsrc_free(state, &mtt);
                                *dereg_level =
                                    TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

                                /* Set "status"/"errormsg", goto failure */
                                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
                                    "failed reference count");
                                goto mrrereghelp_fail;
                        }
                        swrc_new = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
                        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_new))
                        TAVOR_MTT_REFCNT_INIT(swrc_new);
                } else {
                        /* Not currently shared: reuse the existing refcnt */
                        mtt_refcnt = mr->mr_mttrefcntp;
                }

                /*
                 * Using the new mapping and the new MTT resources, write the
                 * updated entries to MTT
                 */
                status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
                if (status != DDI_SUCCESS) {
                        /*
                         * Deregister will be called upon returning failure
                         * from this routine. This will ensure that all
                         * current resources get properly freed up.
                         * Unnecessary to attempt to regain software ownership
                         * of the MPT entry as that has already been done
                         * above (in tavor_mr_reregister()).  Also unnecessary
                         * to attempt to unbind the memory.
                         *
                         * But we need to unbind the newly bound memory,
                         * free up the newly allocated MTT entries, and
                         * (possibly) free the new MTT reference count
                         * resource before returning.
                         */
                        if (TAVOR_MTT_IS_SHARED(swrc_old)) {
                                tavor_rsrc_free(state, &mtt_refcnt);
                        }
                        tavor_mr_mem_unbind(state, bind);
                        tavor_rsrc_free(state, &mtt);
                        *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed write mtt");
                        goto mrrereghelp_fail;
                }

                /*
                 * Check if the memory region MTT is shared by any other MRs.
                 * Since the resource may be shared between multiple memory
                 * regions (as a result of a "RegisterSharedMR()" verb) it is
                 * important that we not free up any resources prematurely.
                 */
                if (TAVOR_MTT_IS_SHARED(swrc_old)) {
                        /* Decrement MTT reference count for "old" region */
                        (void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
                } else {
                        /* Free up the old MTT entries resource */
                        tavor_rsrc_free(state, &mr->mr_mttrsrcp);
                }

                /* Put the updated information into the mrhdl */
                mr->mr_bindinfo        = *bind;
                mr->mr_logmttpgsz = mtt_pgsize_bits;
                mr->mr_mttrsrcp   = mtt;
                mr->mr_mttrefcntp = mtt_refcnt;
        }

        /*
         * Calculate and return the updated MTT address (in the DDR address
         * space).  This will be used by the caller (tavor_mr_reregister) in
         * the updated MPT entry
         */
        rsrc_pool       = &state->ts_rsrc_hdl[TAVOR_MTT];
        mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
        *mtt_addr       = mtt_ddrbaseaddr + (mtt->tr_indx <<
            TAVOR_MTT_SIZE_SHIFT);

        TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
        return (DDI_SUCCESS);

mrrereghelp_fail:
        TNF_PROBE_1(tavor_mr_rereg_xlat_helper_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
        return (status);
}
2492 
2493 
2494 /*
2495  * tavor_mr_nummtt_needed()
2496  *    Context: Can be called from interrupt or base context.
2497  */
2498 /* ARGSUSED */
2499 static uint64_t
2500 tavor_mr_nummtt_needed(tavor_state_t *state, tavor_bind_info_t *bind,
2501     uint_t *mtt_pgsize_bits)
2502 {
2503         uint64_t        pg_offset_mask;
2504         uint64_t        pg_offset, tmp_length;
2505 
2506         /*
2507          * For now we specify the page size as 8Kb (the default page size for
2508          * the sun4u architecture), or 4Kb for x86.  Figure out optimal page
2509          * size by examining the dmacookies XXX
2510          */
2511         *mtt_pgsize_bits = PAGESHIFT;
2512 
2513         pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1;
2514         pg_offset = bind->bi_addr & pg_offset_mask;
2515         tmp_length = pg_offset + (bind->bi_len - 1);
2516         return ((tmp_length >> *mtt_pgsize_bits) + 1);
2517 }
2518 
2519 
2520 /*
2521  * tavor_mr_mem_bind()
2522  *    Context: Can be called from interrupt or base context.
2523  */
static int
tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
    ddi_dma_handle_t dmahdl, uint_t sleep)
{
        ddi_dma_attr_t  dma_attr;
        int             (*callback)(caddr_t);
        uint_t          dma_xfer_mode;
        int             status;

        /* bi_type must be set to a meaningful value to get a bind handle */
        ASSERT(bind->bi_type == TAVOR_BINDHDL_VADDR ||
            bind->bi_type == TAVOR_BINDHDL_BUF ||
            bind->bi_type == TAVOR_BINDHDL_UBUF);

        TAVOR_TNF_ENTER(tavor_mr_mem_bind);

        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))

        /* Set the callback flag appropriately (sleep vs. no-sleep context) */
        callback = (sleep == TAVOR_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;

        /* Determine whether to map STREAMING or CONSISTENT */
        dma_xfer_mode = (bind->bi_flags & IBT_MR_NONCOHERENT) ?
            DDI_DMA_STREAMING : DDI_DMA_CONSISTENT;

        /*
         * Initialize many of the default DMA attributes.  Then, if we're
         * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag.
         * If the caller supplied a DMA handle (dmahdl != NULL), it is
         * reused as-is and none of this attribute setup is performed.
         */
        if (dmahdl == NULL) {
                tavor_dma_attr_init(&dma_attr);
#ifdef  __sparc
                /*
                 * First, disable streaming and switch to consistent if
                 * configured to do so and IOMMU BYPASS is enabled.
                 */
                if (state->ts_cfg_profile->cp_disable_streaming_on_bypass &&
                    dma_xfer_mode == DDI_DMA_STREAMING &&
                    bind->bi_bypass == TAVOR_BINDMEM_BYPASS) {
                        dma_xfer_mode = DDI_DMA_CONSISTENT;
                }

                /*
                 * Then, request IOMMU bypass (DDI_DMA_FORCE_PHYSICAL) only
                 * when the mapping is (now) CONSISTENT.  NOTE(review): an
                 * earlier comment here read "if streaming is still
                 * specified, then 'bypass' is not allowed", but the test
                 * below checks for DDI_DMA_CONSISTENT -- i.e. bypass is
                 * applied only in the consistent case, leaving a STREAMING
                 * mapping without the bypass flag.  Confirm this is the
                 * intended way of expressing that constraint.
                 */
                if ((dma_xfer_mode == DDI_DMA_CONSISTENT) &&
                    (bind->bi_bypass == TAVOR_BINDMEM_BYPASS)) {
                        dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
                }
#endif
                /* Allocate a DMA handle for the binding */
                status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr,
                    callback, NULL, &bind->bi_dmahdl);
                if (status != DDI_SUCCESS) {
                        TNF_PROBE_0(tavor_mr_mem_bind_dmahdl_fail,
                            TAVOR_TNF_ERROR, "");
                        TAVOR_TNF_EXIT(tavor_mr_mem_bind);
                        return (status);
                }
                /* We allocated the handle, so we are responsible to free it */
                bind->bi_free_dmahdl = 1;

        } else  {
                /* Reusing the caller's handle; caller retains ownership */
                bind->bi_dmahdl = dmahdl;
                bind->bi_free_dmahdl = 0;
        }

        /*
         * Bind the memory to get the PCI mapped addresses.  The decision
         * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle()
         * is determined by the "bi_type" flag.  Note: if the bind operation
         * fails then we have to free up the DMA handle and return error.
         */
        if (bind->bi_type == TAVOR_BINDHDL_VADDR) {
                status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL,
                    (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len,
                    (DDI_DMA_RDWR | dma_xfer_mode), callback, NULL,
                    &bind->bi_dmacookie, &bind->bi_cookiecnt);
        } else { /* TAVOR_BINDHDL_BUF || TAVOR_BINDHDL_UBUF */
                status = ddi_dma_buf_bind_handle(bind->bi_dmahdl,
                    bind->bi_buf, (DDI_DMA_RDWR | dma_xfer_mode), callback,
                    NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt);
        }

        if (status != DDI_DMA_MAPPED) {
                /* Only free the handle if this routine allocated it above */
                if (bind->bi_free_dmahdl != 0) {
                        ddi_dma_free_handle(&bind->bi_dmahdl);
                }
                TNF_PROBE_0(tavor_mr_mem_bind_dmabind_fail, TAVOR_TNF_ERROR,
                    "");
                TAVOR_TNF_EXIT(tavor_mr_mem_bind);
                return (status);
        }

        TAVOR_TNF_EXIT(tavor_mr_mem_bind);
        return (DDI_SUCCESS);
}
2621 
2622 
2623 /*
2624  * tavor_mr_mem_unbind()
2625  *    Context: Can be called from interrupt or base context.
2626  */
2627 static void
2628 tavor_mr_mem_unbind(tavor_state_t *state, tavor_bind_info_t *bind)
2629 {
2630         int     status;
2631 
2632         TAVOR_TNF_ENTER(tavor_mr_mem_unbind);
2633 
2634         /*
2635          * In case of TAVOR_BINDHDL_UBUF, the memory bi_buf points to
2636          * is actually allocated by ddi_umem_iosetup() internally, then
2637          * it's required to free it here. Reset bi_type to TAVOR_BINDHDL_NONE
2638          * not to free it again later.
2639          */
2640         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2641         if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
2642                 freerbuf(bind->bi_buf);
2643                 bind->bi_type = TAVOR_BINDHDL_NONE;
2644         }
2645         _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
2646 
2647         /*
2648          * Unbind the DMA memory for the region
2649          *
2650          * Note: The only way ddi_dma_unbind_handle() currently
2651          * can return an error is if the handle passed in is invalid.
2652          * Since this should never happen, we choose to return void
2653          * from this function!  If this does return an error, however,
2654          * then we print a warning message to the console.
2655          */
2656         status = ddi_dma_unbind_handle(bind->bi_dmahdl);
2657         if (status != DDI_SUCCESS) {
2658                 TAVOR_WARNING(state, "failed to unbind DMA mapping");
2659                 TNF_PROBE_0(tavor_mr_mem_unbind_dmaunbind_fail,
2660                     TAVOR_TNF_ERROR, "");
2661                 TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2662                 return;
2663         }
2664 
2665         /* Free up the DMA handle */
2666         if (bind->bi_free_dmahdl != 0) {
2667                 ddi_dma_free_handle(&bind->bi_dmahdl);
2668         }
2669 
2670         TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2671 }
2672 
2673 
2674 /*
2675  * tavor_mr_fast_mtt_write()
2676  *    Context: Can be called from interrupt or base context.
2677  */
2678 static int
2679 tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
2680     uint32_t mtt_pgsize_bits)
2681 {
2682         ddi_dma_cookie_t        dmacookie;
2683         uint_t                  cookie_cnt;
2684         uint64_t                *mtt_table;
2685         uint64_t                mtt_entry;
2686         uint64_t                addr, endaddr;
2687         uint64_t                pagesize;
2688         int                     i;
2689 
2690         TAVOR_TNF_ENTER(tavor_mr_fast_mtt_write);
2691 
2692         /* Calculate page size from the suggested value passed in */
2693         pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2694 
2695         /*
2696          * Walk the "cookie list" and fill in the MTT table entries
2697          */
2698         i = 0;
2699         mtt_table  = (uint64_t *)mtt->tr_addr;
2700         dmacookie  = bind->bi_dmacookie;
2701         cookie_cnt = bind->bi_cookiecnt;
2702         while (cookie_cnt-- > 0) {
2703                 addr    = dmacookie.dmac_laddress;
2704                 endaddr = addr + (dmacookie.dmac_size - 1);
2705                 addr    = addr & ~((uint64_t)pagesize - 1);
2706                 while (addr <= endaddr) {
2707                         /*
2708                          * Fill in the mapped addresses (calculated above) and
2709                          * set TAVOR_MTT_ENTRY_PRESET flag for each MTT entry.
2710                          */
2711                         mtt_entry = addr | TAVOR_MTT_ENTRY_PRESET;
2712                         ddi_put64(mtt->tr_acchdl, &mtt_table[i], mtt_entry);
2713                         addr += pagesize;
2714                         i++;
2715 
2716                         if (addr == 0) {
2717                                 static int do_once = 1;
2718                                 _NOTE(SCHEME_PROTECTS_DATA("safe sharing",
2719                                     do_once))
2720                                 if (do_once) {
2721                                         do_once = 0;
2722                                         cmn_err(CE_NOTE, "probable error in "
2723                                             "dma_cookie address from caller\n");
2724                                 }
2725                                 break;
2726                         }
2727                 }
2728 
2729                 /*
2730                  * When we've reached the end of the current DMA cookie,
2731                  * jump to the next cookie (if there are more)
2732                  */
2733                 if (cookie_cnt != 0) {
2734                         ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie);
2735                 }
2736         }
2737 
2738         TAVOR_TNF_EXIT(tavor_mr_fast_mtt_write);
2739         return (DDI_SUCCESS);
2740 }
2741 
2742 /*
2743  * tavor_mtt_refcnt_inc()
2744  *    Context: Can be called from interrupt or base context.
2745  */
2746 static int
2747 tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc)
2748 {
2749         tavor_sw_refcnt_t *rc;
2750         uint32_t          cnt;
2751 
2752         rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
2753 
2754         /* Increment the MTT's reference count */
2755         mutex_enter(&rc->swrc_lock);
2756         TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_inc, TAVOR_TNF_TRACE, "",
2757             tnf_uint, refcnt, rc->swrc_refcnt);
2758         cnt = rc->swrc_refcnt++;
2759         mutex_exit(&rc->swrc_lock);
2760 
2761         return (cnt);
2762 }
2763 
2764 
2765 /*
2766  * tavor_mtt_refcnt_dec()
2767  *    Context: Can be called from interrupt or base context.
2768  */
2769 static int
2770 tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc)
2771 {
2772         tavor_sw_refcnt_t *rc;
2773         uint32_t          cnt;
2774 
2775         rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
2776 
2777         /* Decrement the MTT's reference count */
2778         mutex_enter(&rc->swrc_lock);
2779         cnt = --rc->swrc_refcnt;
2780         TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_dec, TAVOR_TNF_TRACE, "",
2781             tnf_uint, refcnt, rc->swrc_refcnt);
2782         mutex_exit(&rc->swrc_lock);
2783 
2784         return (cnt);
2785 }