1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * tavor_mr.c
  28  *    Tavor Memory Region/Window Routines
  29  *
  30  *    Implements all the routines necessary to provide the requisite memory
  31  *    registration verbs.  These include operations like RegisterMemRegion(),
  32  *    DeregisterMemRegion(), ReregisterMemRegion, RegisterSharedMemRegion,
  33  *    etc., that affect Memory Regions.  It also includes the verbs that
  34  *    affect Memory Windows, including AllocMemWindow(), FreeMemWindow(),
  35  *    and QueryMemWindow().
  36  */
  37 
  38 #include <sys/types.h>
  39 #include <sys/conf.h>
  40 #include <sys/ddi.h>
  41 #include <sys/sunddi.h>
  42 #include <sys/modctl.h>
  43 #include <sys/esunddi.h>
  44 
  45 #include <sys/ib/adapters/tavor/tavor.h>
  46 
  47 
/*
 * Used by tavor_mr_keycalc() below to fill in the "unconstrained" portion
 * of Tavor memory keys (LKeys and RKeys)
 */
static uint_t tavor_debug_memkey_cnt = 0x00000000;

/*
 * Forward declarations for the file-local (static) helper routines.
 *
 * Common registration and reregistration paths.  tavor_mr_common_reg() is
 * the routine that tavor_mr_register() and tavor_mr_register_buf() pass
 * their filled-in "bind" struct to.
 */
static int tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
    tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op);
static int tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
    tavor_mr_options_t *op);
static int tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
    uint_t sleep, uint_t *dereg_level);
/* MTT sizing and memory bind/unbind helpers */
static uint64_t tavor_mr_nummtt_needed(tavor_state_t *state,
    tavor_bind_info_t *bind, uint_t *mtt_pgsize);
static int tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
    ddi_dma_handle_t dmahdl, uint_t sleep);
static void tavor_mr_mem_unbind(tavor_state_t *state,
    tavor_bind_info_t *bind);
static int tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
    uint32_t mtt_pgsize_bits);
/*
 * MTT reference counting.  These are used when an MTT is shared between
 * memory regions (see tavor_mr_register_shared() and tavor_mr_deregister()).
 */
static int tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc);
static int tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc);
  72 
/*
 * The Tavor umem_lockmemory() callback ops.  When userland memory is
 * registered, these callback ops are specified.  The tavor_umap_umemlock_cb()
 * callback will be called whenever the memory for the corresponding
 * ddi_umem_cookie_t is being freed.
 *
 * A pointer to this structure is what tavor_mr_register_shared() passes to
 * umem_lockmemory() when it re-pins userland pages.
 */
static struct umem_callback_ops tavor_umem_cbops = {
        UMEM_CALLBACK_VERSION,          /* callback ops version */
        tavor_umap_umemlock_cb,         /* callback routine (see above) */
};
  83 
  84 
  85 /*
  86  * tavor_mr_register()
  87  *    Context: Can be called from interrupt or base context.
  88  */
  89 int
  90 tavor_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
  91     ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
  92 {
  93         tavor_bind_info_t       bind;
  94         int                     status;
  95 
  96         TAVOR_TNF_ENTER(tavor_mr_register);
  97 
  98         /*
  99          * Fill in the "bind" struct.  This struct provides the majority
 100          * of the information that will be used to distinguish between an
 101          * "addr" binding (as is the case here) and a "buf" binding (see
 102          * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
 103          * which does most of the "heavy lifting" for the Tavor memory
 104          * registration routines.
 105          */
 106         bind.bi_type  = TAVOR_BINDHDL_VADDR;
 107         bind.bi_addr  = mr_attr->mr_vaddr;
 108         bind.bi_len   = mr_attr->mr_len;
 109         bind.bi_as    = mr_attr->mr_as;
 110         bind.bi_flags = mr_attr->mr_flags;
 111         status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
 112         if (status != DDI_SUCCESS) {
 113                 TNF_PROBE_0(tavor_mr_register_cmnreg_fail,
 114                     TAVOR_TNF_ERROR, "");
 115                 TAVOR_TNF_EXIT(tavor_mr_register);
 116                 return (status);
 117         }
 118 
 119         TAVOR_TNF_EXIT(tavor_mr_register);
 120         return (DDI_SUCCESS);
 121 }
 122 
 123 
 124 /*
 125  * tavor_mr_register_buf()
 126  *    Context: Can be called from interrupt or base context.
 127  */
 128 int
 129 tavor_mr_register_buf(tavor_state_t *state, tavor_pdhdl_t pd,
 130     ibt_smr_attr_t *mr_attr, struct buf *buf, tavor_mrhdl_t *mrhdl,
 131     tavor_mr_options_t *op)
 132 {
 133         tavor_bind_info_t       bind;
 134         int                     status;
 135 
 136         TAVOR_TNF_ENTER(tavor_mr_register_buf);
 137 
 138         /*
 139          * Fill in the "bind" struct.  This struct provides the majority
 140          * of the information that will be used to distinguish between an
 141          * "addr" binding (see above) and a "buf" binding (as is the case
 142          * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
 143          * which does most of the "heavy lifting" for the Tavor memory
 144          * registration routines.  Note: We have chosen to provide
 145          * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
 146          * not set).  It is not critical what value we choose here as it need
 147          * only be unique for the given RKey (which will happen by default),
 148          * so the choice here is somewhat arbitrary.
 149          */
 150         bind.bi_type  = TAVOR_BINDHDL_BUF;
 151         bind.bi_buf   = buf;
 152         if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
 153                 bind.bi_addr  = mr_attr->mr_vaddr;
 154         } else {
 155                 bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
 156         }
 157         bind.bi_as    = NULL;
 158         bind.bi_len   = (uint64_t)buf->b_bcount;
 159         bind.bi_flags = mr_attr->mr_flags;
 160         status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
 161         if (status != DDI_SUCCESS) {
 162                 TNF_PROBE_0(tavor_mr_register_buf_cmnreg_fail,
 163                     TAVOR_TNF_ERROR, "");
 164                 TAVOR_TNF_EXIT(tavor_mr_register_buf);
 165                 return (status);
 166         }
 167 
 168         TAVOR_TNF_EXIT(tavor_mr_register_buf);
 169         return (DDI_SUCCESS);
 170 }
 171 
 172 
/*
 * tavor_mr_register_shared()
 *    Context: Can be called from interrupt or base context.
 *
 *    Creates a new memory region handle that shares the MTT entries of an
 *    already registered region.  A new MPT entry and a new MR handle are
 *    allocated, the MTT reference count is bumped, and the MPT is handed
 *    to the hardware with a SW2HW_MPT command.
 *
 *    Arguments:
 *      state     - Tavor soft state for this HCA instance
 *      mrhdl     - existing memory region whose MTT entries will be shared
 *      pd        - protection domain the new region is placed in
 *      mr_attr   - supplies the access enable flags, the IBT_MR_NOSLEEP
 *                  flag, and the virtual address for the new mapping
 *      mrhdl_new - set to the new memory region handle on success
 *
 *    Returns DDI_SUCCESS on success.  On failure, everything set up so far
 *    is undone and "status" is returned; per the inline comments, "status"
 *    is set by TAVOR_TNF_FAIL() at the point of failure.
 */
int
tavor_mr_register_shared(tavor_state_t *state, tavor_mrhdl_t mrhdl,
    tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new)
{
        tavor_rsrc_pool_info_t  *rsrc_pool;
        tavor_rsrc_t            *mpt, *mtt, *rsrc;
        tavor_umap_db_entry_t   *umapdb;
        tavor_hw_mpt_t          mpt_entry;
        tavor_mrhdl_t           mr;
        tavor_bind_info_t       *bind;
        ddi_umem_cookie_t       umem_cookie;
        size_t                  umem_len;
        caddr_t                 umem_addr;
        uint64_t                mtt_addr, mtt_ddrbaseaddr, pgsize_msk;
        uint_t                  sleep, mr_is_umem;
        int                     status, umem_flags;
        char                    *errormsg;

        TAVOR_TNF_ENTER(tavor_mr_register_shared);

        /*
         * Check the sleep flag.  Ensure that it is consistent with the
         * current thread context (i.e. if we are currently in the interrupt
         * context, then we shouldn't be attempting to sleep).
         */
        sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP :
            TAVOR_SLEEP;
        if ((sleep == TAVOR_SLEEP) &&
            (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
                goto mrshared_fail;
        }

        /* Increment the reference count on the protection domain (PD) */
        tavor_pd_refcnt_inc(pd);

        /*
         * Allocate an MPT entry.  This will be filled in with all the
         * necessary parameters to define the shared memory region.
         * Specifically, it will be made to reference the currently existing
         * MTT entries and ownership of the MPT will be passed to the hardware
         * in the last step below.  If we fail here, we must undo the
         * protection domain reference count.
         */
        status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
                goto mrshared_fail1;
        }

        /*
         * Allocate the software structure for tracking the shared memory
         * region (i.e. the Tavor Memory Region handle).  If we fail here, we
         * must undo the protection domain reference count and the previous
         * resource allocation.
         */
        status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
                goto mrshared_fail2;
        }
        mr = (tavor_mrhdl_t)rsrc->tr_addr;

        /*
         * Setup and validate the memory region access flags.  This means
         * translating the IBTF's enable flags into the access flags that
         * will be used in later operations.
         */
        mr->mr_accflag = 0;
        if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND)
                mr->mr_accflag |= IBT_MR_WINDOW_BIND;
        if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
                mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
        if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ)
                mr->mr_accflag |= IBT_MR_REMOTE_READ;
        if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
                mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
        if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
                mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

        /*
         * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
         * from a certain number of "constrained" bits (the least significant
         * bits) and some number of "unconstrained" bits.  The constrained
         * bits must be set to the index of the entry in the MPT table, but
         * the unconstrained bits can be set to any value we wish.  Note:
         * if no remote access is required, then the RKey value is not filled
         * in.  Otherwise both Rkey and LKey are given the same value.
         *
         * NOTE(review): when no remote access flag is set, "mr_rkey" is not
         * written anywhere in this routine.  Whether the handle returned by
         * tavor_rsrc_alloc() is zeroed is not visible here - confirm.
         */
        tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
        if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
            (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
            (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
                mr->mr_rkey = mr->mr_lkey;
        }

        /* Grab the MR lock for the current memory region */
        mutex_enter(&mrhdl->mr_lock);

        /*
         * Check here to see if the memory region has already been partially
         * deregistered as a result of a tavor_umap_umemlock_cb() callback.
         * If so, this is an error, return failure.
         */
        if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
                mutex_exit(&mrhdl->mr_lock);
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
                goto mrshared_fail3;
        }

        /*
         * Determine if the original memory was from userland and, if so, pin
         * the pages (again) with umem_lockmemory().  This will guarantee a
         * separate callback for each of this shared region's MR handles.
         * If this is userland memory, then allocate an entry in the
         * "userland resources database".  This will later be added to
         * the database (after all further memory registration operations are
         * successful).  If we fail here, we must undo all the above setup.
         *
         * The range handed to umem_lockmemory() is the original binding
         * widened to whole pages: the address is rounded down to a page
         * boundary and the length is rounded up to cover the page offset.
         */
        mr_is_umem = mrhdl->mr_is_umem;
        if (mr_is_umem) {
                umem_len   = ptob(btopr(mrhdl->mr_bindinfo.bi_len +
                    ((uintptr_t)mrhdl->mr_bindinfo.bi_addr & PAGEOFFSET)));
                umem_addr  = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr &
                    ~PAGEOFFSET);
                umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
                    DDI_UMEMLOCK_LONGTERM);
                status = umem_lockmemory(umem_addr, umem_len, umem_flags,
                    &umem_cookie, &tavor_umem_cbops, NULL);
                if (status != 0) {
                        mutex_exit(&mrhdl->mr_lock);
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
                        goto mrshared_fail3;
                }

                /* The database entry is keyed by the new umem cookie */
                umapdb = tavor_umap_db_alloc(state->ts_instance,
                    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
                    (uint64_t)(uintptr_t)rsrc);
                if (umapdb == NULL) {
                        mutex_exit(&mrhdl->mr_lock);
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
                        goto mrshared_fail4;
                }
        }

        /*
         * Copy the MTT resource pointer (and additional parameters) from
         * the original Tavor Memory Region handle.  Note: this is normally
         * where the tavor_mr_mem_bind() routine would be called, but because
         * we already have bound and filled-in MTT entries it is simply a
         * matter here of managing the MTT reference count and grabbing the
         * address of the MTT table entries (for filling in the shared region's
         * MPT entry).
         */
        mr->mr_mttrsrcp        = mrhdl->mr_mttrsrcp;
        mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz;
        mr->mr_bindinfo        = mrhdl->mr_bindinfo;
        mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp;
        mutex_exit(&mrhdl->mr_lock);
        bind = &mr->mr_bindinfo;
        mtt = mr->mr_mttrsrcp;

        /*
         * Increment the MTT reference count (to reflect the fact that
         * the MTT is now shared)
         */
        (void) tavor_mtt_refcnt_inc(mr->mr_mttrefcntp);

        /*
         * Update the new "bind" virtual address.  Do some extra work here
         * to ensure proper alignment.  That is, make sure that the page
         * offset for the beginning of the old range is the same as the
         * offset for this new mapping
         *
         * Note: "bind" points at mr->mr_bindinfo, so the right-hand side
         * reads the address copied from the original handle before the
         * assignment overwrites it.
         */
        pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
        bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) |
            (mr->mr_bindinfo.bi_addr & pgsize_msk));

        /*
         * Get the base address for the MTT table.  This will be necessary
         * in the next step when we are setting up the MPT entry.
         */
        rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
        mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

        /*
         * Fill in the MPT entry.  This is the final step before passing
         * ownership of the MPT entry to the Tavor hardware.  We use all of
         * the information collected/calculated above to fill in the
         * requisite portions of the MPT.
         */
        bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
        mpt_entry.m_io    = TAVOR_MEM_CYCLE_GENERATE;
        mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
        mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
        mpt_entry.rw      = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
        mpt_entry.rr      = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
        mpt_entry.lw      = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
        mpt_entry.lr      = 1;
        mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
        /*
         * NOTE(review): the page size is stored relative to 0xC (12);
         * presumably the field is log2(page size) minus log2(4KB) - confirm
         * against the tavor_hw_mpt_t definition.
         */
        mpt_entry.page_sz       = mr->mr_logmttpgsz - 0xC;
        mpt_entry.mem_key       = mr->mr_lkey;
        mpt_entry.pd            = pd->pd_pdnum;
        mpt_entry.start_addr    = bind->bi_addr;
        mpt_entry.reg_win_len   = bind->bi_len;
        mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND;
        /*
         * NOTE(review): the low half of the MTT address is stored shifted
         * right by 6; presumably the field holds only the address bits
         * above bit 5 - confirm against the tavor_hw_mpt_t definition.
         */
        mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
        mpt_entry.mttseg_addr_h = mtt_addr >> 32;
        mpt_entry.mttseg_addr_l = mtt_addr >> 6;

        /*
         * Write the MPT entry to hardware.  Lastly, we pass ownership of
         * the entry to the hardware.  Note: in general, this operation
         * shouldn't fail.  But if it does, we have to undo everything we've
         * done above before returning error.
         */
        status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
            sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
        if (status != TAVOR_CMD_SUCCESS) {
                cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_mr_register_shared_sw2hw_mpt_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
                    "tavor SW2HW_MPT command");
                goto mrshared_fail5;
        }

        /*
         * Fill in the rest of the Tavor Memory Region handle.  Having
         * successfully transferred ownership of the MPT, we can update the
         * following fields for use in further operations on the MR.
         * (Note: "mr_mttrsrcp" was already copied from the original handle
         * above, and "mtt" holds that same value.)
         */
        mr->mr_mptrsrcp        = mpt;
        mr->mr_mttrsrcp        = mtt;
        mr->mr_pdhdl   = pd;
        mr->mr_rsrcp   = rsrc;
        mr->mr_is_umem         = mr_is_umem;
        mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
        mr->mr_umem_cbfunc = NULL;
        mr->mr_umem_cbarg1 = NULL;
        mr->mr_umem_cbarg2 = NULL;

        /*
         * If this is userland memory, then we need to insert the previously
         * allocated entry into the "userland resources database".  This will
         * allow for later coordination between the tavor_umap_umemlock_cb()
         * callback and tavor_mr_deregister().
         */
        if (mr_is_umem) {
                tavor_umap_db_add(umapdb);
        }

        *mrhdl_new = mr;

        TAVOR_TNF_EXIT(tavor_mr_register_shared);
        return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine.
 * The labels fall through one into the next, so each failure point undoes
 * its own step and every step that came before it:
 *    fail5 - drop the MTT reference and free the umap database entry
 *    fail4 - unlock the re-pinned userland memory
 *    fail3 - free the MR handle
 *    fail2 - free the MPT entry
 *    fail1 - drop the PD reference
 *    fail  - log the failure and return "status"
 * "umapdb" and "umem_cookie" are only valid for userland memory, hence the
 * "mr_is_umem" guards.
 */
mrshared_fail5:
        (void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
        if (mr_is_umem) {
                tavor_umap_db_free(umapdb);
        }
mrshared_fail4:
        if (mr_is_umem) {
                ddi_umem_unlock(umem_cookie);
        }
mrshared_fail3:
        tavor_rsrc_free(state, &rsrc);
mrshared_fail2:
        tavor_rsrc_free(state, &mpt);
mrshared_fail1:
        tavor_pd_refcnt_dec(pd);
mrshared_fail:
        TNF_PROBE_1(tavor_mr_register_shared_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_mr_register_shared);
        return (status);
}
 466 
 467 
/*
 * tavor_mr_deregister()
 *    Context: Can be called from interrupt or base context.
 *
 *    Tears down a memory region: optionally reclaims the MPT entry from
 *    the hardware, releases any userland memory lock, drops the MTT
 *    reference (freeing the MTT entries when this was the last user), and
 *    frees the MR handle, the MPT entry and the PD reference.
 *
 *    Arguments:
 *      state - Tavor soft state for this HCA instance
 *      mrhdl - pointer to the memory region handle; set to NULL once the
 *              handle has been completely freed
 *      level - how much to undo.  HW2SW_MPT is posted only when "level"
 *              is at least TAVOR_MR_DEREG_ALL; the memory is unbound
 *              only when it is at least TAVOR_MR_DEREG_NO_HW2SW_MPT.
 *      sleep - TAVOR_SLEEP or TAVOR_NOSLEEP; must be consistent with the
 *              calling context
 *
 *    Returns DDI_SUCCESS on success (including the partial cleanup case
 *    described below, in which "*mrhdl" is left untouched), IBT_MR_IN_USE
 *    if the hardware reports TAVOR_CMD_REG_BOUND, IBT_INVALID_PARAM if
 *    HW2SW_MPT fails for any other reason, or the status set by
 *    TAVOR_TNF_FAIL() if the sleep flag is inconsistent with the context.
 */
/* ARGSUSED */
int
tavor_mr_deregister(tavor_state_t *state, tavor_mrhdl_t *mrhdl, uint_t level,
    uint_t sleep)
{
        tavor_rsrc_t            *mpt, *mtt, *rsrc, *mtt_refcnt;
        tavor_umap_db_entry_t   *umapdb;
        tavor_pdhdl_t           pd;
        tavor_mrhdl_t           mr;
        tavor_bind_info_t       *bind;
        uint64_t                value;
        int                     status, shared_mtt;
        char                    *errormsg;

        TAVOR_TNF_ENTER(tavor_mr_deregister);

        /*
         * Check the sleep flag.  Ensure that it is consistent with the
         * current thread context (i.e. if we are currently in the interrupt
         * context, then we shouldn't be attempting to sleep).
         */
        if ((sleep == TAVOR_SLEEP) &&
            (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
                TNF_PROBE_1(tavor_mr_deregister_fail, TAVOR_TNF_ERROR, "",
                    tnf_string, msg, errormsg);
                TAVOR_TNF_EXIT(tavor_mr_deregister);
                return (status);
        }

        /*
         * Pull all the necessary information from the Tavor Memory Region
         * handle.  This is necessary here because the resource for the
         * MR handle is going to be freed up as part of this
         * deregistration
         */
        mr      = *mrhdl;
        mutex_enter(&mr->mr_lock);
        mpt     = mr->mr_mptrsrcp;
        mtt     = mr->mr_mttrsrcp;
        mtt_refcnt = mr->mr_mttrefcntp;
        rsrc    = mr->mr_rsrcp;
        pd      = mr->mr_pdhdl;
        bind    = &mr->mr_bindinfo;

        /*
         * Check here to see if the memory region has already been partially
         * deregistered as a result of the tavor_umap_umemlock_cb() callback.
         * If so, then jump to the end and free the remaining resources.
         * (The lock is still held on this path; it is dropped at the label.)
         */
        if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
                goto mrdereg_finish_cleanup;
        }

        /*
         * We must drop the "mr_lock" here to ensure that both SLEEP and
         * NOSLEEP calls into the firmware work as expected.  Also, if two
         * threads are attempting to access this MR (via de-register,
         * re-register, or otherwise), then we allow the firmware to enforce
         * the checking, that only one deregister is valid.
         */
        mutex_exit(&mr->mr_lock);

        /*
         * Reclaim MPT entry from hardware (if necessary).  Since the
         * tavor_mr_deregister() routine is used in the memory region
         * reregistration process as well, it is possible that we will
         * not always wish to reclaim ownership of the MPT.  Check the
         * "level" arg and, if necessary, attempt to reclaim it.  If
         * the ownership transfer fails for any reason, we check to see
         * what command status was returned from the hardware.  The only
         * "expected" error status is the one that indicates an attempt to
         * deregister a memory region that has memory windows bound to it
         *
         * Note: both failure returns below leave the MR handle and all of
         * its resources untouched, and the lock is not held at this point.
         */
        if (level >= TAVOR_MR_DEREG_ALL) {
                status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT,
                    NULL, 0, mpt->tr_indx, sleep);
                if (status != TAVOR_CMD_SUCCESS) {
                        if (status == TAVOR_CMD_REG_BOUND) {
                                TAVOR_TNF_EXIT(tavor_mr_deregister);
                                return (IBT_MR_IN_USE);
                        } else {
                                cmn_err(CE_CONT, "Tavor: HW2SW_MPT command "
                                    "failed: %08x\n", status);
                                TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail,
                                    TAVOR_TNF_ERROR, "", tnf_uint, status,
                                    status);
                                TAVOR_TNF_EXIT(tavor_mr_deregister);
                                return (IBT_INVALID_PARAM);
                        }
                }
        }

        /*
         * Re-grab the mr_lock here.  Since further access to the protected
         * 'mr' structure is needed, and we would have returned previously for
         * the multiple deregistration case, we can safely grab the lock here.
         */
        mutex_enter(&mr->mr_lock);

        /*
         * If the memory had come from userland, then we do a lookup in the
         * "userland resources database".  On success, we free the entry, call
         * ddi_umem_unlock(), and continue the cleanup.  On failure (which is
         * an indication that the umem_lockmemory() callback has called
         * tavor_mr_deregister()), we call ddi_umem_unlock() and invalidate
         * the "mr_umemcookie" field in the MR handle (this will be used
         * later to detect that only partial cleanup still remains to be done
         * on the MR handle).
         */
        if (mr->mr_is_umem) {
                status = tavor_umap_db_find(state->ts_instance,
                    (uint64_t)(uintptr_t)mr->mr_umemcookie,
                    MLNX_UMAP_MRMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
                    &umapdb);
                if (status == DDI_SUCCESS) {
                        tavor_umap_db_free(umapdb);
                        ddi_umem_unlock(mr->mr_umemcookie);
                } else {
                        ddi_umem_unlock(mr->mr_umemcookie);
                        mr->mr_umemcookie = NULL;
                }
        }

        /* mtt_refcnt is NULL in the case of tavor_dma_mr_register() */
        if (mtt_refcnt != NULL) {
                /*
                 * Decrement the MTT reference count.  Since the MTT resource
                 * may be shared between multiple memory regions (as a result
                 * of a "RegisterSharedMR" verb) it is important that we not
                 * free up or unbind resources prematurely.  If it's not shared
                 * (as indicated by the return status), then free the resource.
                 */
                shared_mtt = tavor_mtt_refcnt_dec(mtt_refcnt);
                if (!shared_mtt) {
                        tavor_rsrc_free(state, &mtt_refcnt);
                }

                /*
                 * Free up the MTT entries and unbind the memory.  Here,
                 * as above, we attempt to free these resources only if
                 * it is appropriate to do so.
                 */
                if (!shared_mtt) {
                        if (level >= TAVOR_MR_DEREG_NO_HW2SW_MPT) {
                                tavor_mr_mem_unbind(state, bind);
                        }
                        tavor_rsrc_free(state, &mtt);
                }
        }

        /*
         * If the MR handle has been invalidated, then drop the
         * lock and return success.  Note: This only happens because
         * the umem_lockmemory() callback has been triggered.  The
         * cleanup here is partial, and further cleanup (in a
         * subsequent tavor_mr_deregister() call) will be necessary.
         * In this case the MR handle, the MPT entry and the PD reference
         * are all kept, and "*mrhdl" is not cleared.
         */
        if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
                mutex_exit(&mr->mr_lock);
                TAVOR_TNF_EXIT(tavor_mr_deregister);
                return (DDI_SUCCESS);
        }

mrdereg_finish_cleanup:
        /* Reached with "mr_lock" held, from either path above */
        mutex_exit(&mr->mr_lock);

        /* Free the Tavor Memory Region handle */
        tavor_rsrc_free(state, &rsrc);

        /* Free up the MPT entry resource */
        tavor_rsrc_free(state, &mpt);

        /* Decrement the reference count on the protection domain (PD) */
        tavor_pd_refcnt_dec(pd);

        /* Set the mrhdl pointer to NULL and return success */
        *mrhdl = NULL;

        TAVOR_TNF_EXIT(tavor_mr_deregister);
        return (DDI_SUCCESS);
}
 655 
 656 
 657 /*
 658  * tavor_mr_query()
 659  *    Context: Can be called from interrupt or base context.
 660  */
 661 /* ARGSUSED */
 662 int
 663 tavor_mr_query(tavor_state_t *state, tavor_mrhdl_t mr,
 664     ibt_mr_query_attr_t *attr)
 665 {
 666         TAVOR_TNF_ENTER(tavor_mr_query);
 667 
 668         mutex_enter(&mr->mr_lock);
 669 
 670         /*
 671          * Check here to see if the memory region has already been partially
 672          * deregistered as a result of a tavor_umap_umemlock_cb() callback.
 673          * If so, this is an error, return failure.
 674          */
 675         if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
 676                 mutex_exit(&mr->mr_lock);
 677                 TNF_PROBE_0(tavor_mr_query_inv_mrhdl_fail, TAVOR_TNF_ERROR, "");
 678                 TAVOR_TNF_EXIT(tavor_mr_query);
 679                 return (IBT_MR_HDL_INVALID);
 680         }
 681 
 682         /* Fill in the queried attributes */
 683         attr->mr_attr_flags = mr->mr_accflag;
 684         attr->mr_pd  = (ibt_pd_hdl_t)mr->mr_pdhdl;
 685 
 686         /* Fill in the "local" attributes */
 687         attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey;
 688         attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
 689         attr->mr_lbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
 690 
 691         /*
 692          * Fill in the "remote" attributes (if necessary).  Note: the
 693          * remote attributes are only valid if the memory region has one
 694          * or more of the remote access flags set.
 695          */
 696         if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
 697             (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
 698             (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
 699                 attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey;
 700                 attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
 701                 attr->mr_rbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
 702         }
 703 
 704         /*
 705          * If region is mapped for streaming (i.e. noncoherent), then set sync
 706          * is required
 707          */
 708         attr->mr_sync_required = (mr->mr_bindinfo.bi_flags &
 709             IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE;
 710 
 711         mutex_exit(&mr->mr_lock);
 712         TAVOR_TNF_EXIT(tavor_mr_query);
 713         return (DDI_SUCCESS);
 714 }
 715 
 716 
 717 /*
 718  * tavor_mr_reregister()
 719  *    Context: Can be called from interrupt or base context.
 720  */
 721 int
 722 tavor_mr_reregister(tavor_state_t *state, tavor_mrhdl_t mr,
 723     tavor_pdhdl_t pd, ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new,
 724     tavor_mr_options_t *op)
 725 {
 726         tavor_bind_info_t       bind;
 727         int                     status;
 728 
 729         TAVOR_TNF_ENTER(tavor_mr_reregister);
 730 
 731         /*
 732          * Fill in the "bind" struct.  This struct provides the majority
 733          * of the information that will be used to distinguish between an
 734          * "addr" binding (as is the case here) and a "buf" binding (see
 735          * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
 736          * which does most of the "heavy lifting" for the Tavor memory
 737          * registration (and reregistration) routines.
 738          */
 739         bind.bi_type  = TAVOR_BINDHDL_VADDR;
 740         bind.bi_addr  = mr_attr->mr_vaddr;
 741         bind.bi_len   = mr_attr->mr_len;
 742         bind.bi_as    = mr_attr->mr_as;
 743         bind.bi_flags = mr_attr->mr_flags;
 744         status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
 745         if (status != DDI_SUCCESS) {
 746                 TNF_PROBE_0(tavor_mr_reregister_cmnreg_fail,
 747                     TAVOR_TNF_ERROR, "");
 748                 TAVOR_TNF_EXIT(tavor_mr_reregister);
 749                 return (status);
 750         }
 751 
 752         TAVOR_TNF_EXIT(tavor_mr_reregister);
 753         return (DDI_SUCCESS);
 754 }
 755 
 756 
 757 /*
 758  * tavor_mr_reregister_buf()
 759  *    Context: Can be called from interrupt or base context.
 760  */
 761 int
 762 tavor_mr_reregister_buf(tavor_state_t *state, tavor_mrhdl_t mr,
 763     tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf,
 764     tavor_mrhdl_t *mrhdl_new, tavor_mr_options_t *op)
 765 {
 766         tavor_bind_info_t       bind;
 767         int                     status;
 768 
 769         TAVOR_TNF_ENTER(tavor_mr_reregister_buf);
 770 
 771         /*
 772          * Fill in the "bind" struct.  This struct provides the majority
 773          * of the information that will be used to distinguish between an
 774          * "addr" binding (see above) and a "buf" binding (as is the case
 775          * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
 776          * which does most of the "heavy lifting" for the Tavor memory
 777          * registration routines.  Note: We have chosen to provide
 778          * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
 779          * not set).  It is not critical what value we choose here as it need
 780          * only be unique for the given RKey (which will happen by default),
 781          * so the choice here is somewhat arbitrary.
 782          */
 783         bind.bi_type  = TAVOR_BINDHDL_BUF;
 784         bind.bi_buf   = buf;
 785         if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
 786                 bind.bi_addr  = mr_attr->mr_vaddr;
 787         } else {
 788                 bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
 789         }
 790         bind.bi_len   = (uint64_t)buf->b_bcount;
 791         bind.bi_flags = mr_attr->mr_flags;
 792         bind.bi_as = NULL;
 793         status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
 794         if (status != DDI_SUCCESS) {
 795                 TNF_PROBE_0(tavor_mr_reregister_buf_cmnreg_fail,
 796                     TAVOR_TNF_ERROR, "");
 797                 TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
 798                 return (status);
 799         }
 800 
 801         TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
 802         return (DDI_SUCCESS);
 803 }
 804 
 805 
 806 /*
 807  * tavor_mr_sync()
 808  *    Context: Can be called from interrupt or base context.
 809  */
 810 /* ARGSUSED */
 811 int
 812 tavor_mr_sync(tavor_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs)
 813 {
 814         tavor_mrhdl_t           mrhdl;
 815         uint64_t                seg_vaddr, seg_len, seg_end;
 816         uint64_t                mr_start, mr_end;
 817         uint_t                  type;
 818         int                     status, i;
 819         char                    *errormsg;
 820 
 821         TAVOR_TNF_ENTER(tavor_mr_sync);
 822 
 823         /* Process each of the ibt_mr_sync_t's */
 824         for (i = 0; i < num_segs; i++) {
 825                 mrhdl = (tavor_mrhdl_t)mr_segs[i].ms_handle;
 826 
 827                 /* Check for valid memory region handle */
 828                 if (mrhdl == NULL) {
 829                         /* Set "status" and "errormsg" and goto failure */
 830                         TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
 831                         goto mrsync_fail;
 832                 }
 833 
 834                 mutex_enter(&mrhdl->mr_lock);
 835 
 836                 /*
 837                  * Check here to see if the memory region has already been
 838                  * partially deregistered as a result of a
 839                  * tavor_umap_umemlock_cb() callback.  If so, this is an
 840                  * error, return failure.
 841                  */
 842                 if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
 843                         mutex_exit(&mrhdl->mr_lock);
 844                         /* Set "status" and "errormsg" and goto failure */
 845                         TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl2");
 846                         goto mrsync_fail;
 847                 }
 848 
 849                 /* Check for valid bounds on sync request */
 850                 seg_vaddr = mr_segs[i].ms_vaddr;
 851                 seg_len   = mr_segs[i].ms_len;
 852                 seg_end   = seg_vaddr + seg_len - 1;
 853                 mr_start  = mrhdl->mr_bindinfo.bi_addr;
 854                 mr_end    = mr_start + mrhdl->mr_bindinfo.bi_len - 1;
 855                 if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) {
 856                         mutex_exit(&mrhdl->mr_lock);
 857                         /* Set "status" and "errormsg" and goto failure */
 858                         TAVOR_TNF_FAIL(IBT_MR_VA_INVALID, "invalid vaddr");
 859                         goto mrsync_fail;
 860                 }
 861                 if ((seg_end < mr_start) || (seg_end > mr_end)) {
 862                         mutex_exit(&mrhdl->mr_lock);
 863                         /* Set "status" and "errormsg" and goto failure */
 864                         TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
 865                         goto mrsync_fail;
 866                 }
 867 
 868                 /* Determine what type (i.e. direction) for sync */
 869                 if (mr_segs[i].ms_flags & IBT_SYNC_READ) {
 870                         type = DDI_DMA_SYNC_FORDEV;
 871                 } else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) {
 872                         type = DDI_DMA_SYNC_FORCPU;
 873                 } else {
 874                         mutex_exit(&mrhdl->mr_lock);
 875                         /* Set "status" and "errormsg" and goto failure */
 876                         TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sync type");
 877                         goto mrsync_fail;
 878                 }
 879 
 880                 (void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl,
 881                     (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type);
 882                 mutex_exit(&mrhdl->mr_lock);
 883         }
 884 
 885         TAVOR_TNF_EXIT(tavor_mr_sync);
 886         return (DDI_SUCCESS);
 887 
 888 mrsync_fail:
 889         TNF_PROBE_1(tavor_mr_sync_fail, TAVOR_TNF_ERROR, "", tnf_string, msg,
 890             errormsg);
 891         TAVOR_TNF_EXIT(tavor_mr_sync);
 892         return (status);
 893 }
 894 
 895 
 896 /*
 897  * tavor_mw_alloc()
 898  *    Context: Can be called from interrupt or base context.
 899  */
 900 int
 901 tavor_mw_alloc(tavor_state_t *state, tavor_pdhdl_t pd, ibt_mw_flags_t flags,
 902     tavor_mwhdl_t *mwhdl)
 903 {
 904         tavor_rsrc_t            *mpt, *rsrc;
 905         tavor_hw_mpt_t          mpt_entry;
 906         tavor_mwhdl_t           mw;
 907         uint_t                  sleep;
 908         int                     status;
 909         char                    *errormsg;
 910 
 911         TAVOR_TNF_ENTER(tavor_mw_alloc);
 912 
 913         /*
 914          * Check the sleep flag.  Ensure that it is consistent with the
 915          * current thread context (i.e. if we are currently in the interrupt
 916          * context, then we shouldn't be attempting to sleep).
 917          */
 918         sleep = (flags & IBT_MW_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
 919         if ((sleep == TAVOR_SLEEP) &&
 920             (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
 921                 /* Set "status" and "errormsg" and goto failure */
 922                 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
 923                 goto mwalloc_fail;
 924         }
 925 
 926         /* Increment the reference count on the protection domain (PD) */
 927         tavor_pd_refcnt_inc(pd);
 928 
 929         /*
 930          * Allocate an MPT entry (for use as a memory window).  Since the
 931          * Tavor hardware uses the MPT entry for memory regions and for
 932          * memory windows, we will fill in this MPT with all the necessary
 933          * parameters for the memory window.  And then (just as we do for
 934          * memory regions) ownership will be passed to the hardware in the
 935          * final step below.  If we fail here, we must undo the protection
 936          * domain reference count.
 937          */
 938         status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
 939         if (status != DDI_SUCCESS) {
 940                 /* Set "status" and "errormsg" and goto failure */
 941                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
 942                 goto mwalloc_fail1;
 943         }
 944 
 945         /*
 946          * Allocate the software structure for tracking the memory window (i.e.
 947          * the Tavor Memory Window handle).  Note: This is actually the same
 948          * software structure used for tracking memory regions, but since many
 949          * of the same properties are needed, only a single structure is
 950          * necessary.  If we fail here, we must undo the protection domain
 951          * reference count and the previous resource allocation.
 952          */
 953         status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
 954         if (status != DDI_SUCCESS) {
 955                 /* Set "status" and "errormsg" and goto failure */
 956                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
 957                 goto mwalloc_fail2;
 958         }
 959         mw = (tavor_mwhdl_t)rsrc->tr_addr;
 960 
 961         /*
 962          * Calculate an "unbound" RKey from MPT index.  In much the same way
 963          * as we do for memory regions (above), this key is constructed from
 964          * a "constrained" (which depends on the MPT index) and an
 965          * "unconstrained" portion (which may be arbitrarily chosen).
 966          */
 967         tavor_mr_keycalc(state, mpt->tr_indx, &mw->mr_rkey);
 968 
 969         /*
 970          * Fill in the MPT entry.  This is the final step before passing
 971          * ownership of the MPT entry to the Tavor hardware.  We use all of
 972          * the information collected/calculated above to fill in the
 973          * requisite portions of the MPT.  Note: fewer entries in the MPT
 974          * entry are necessary to allocate a memory window.
 975          */
 976         bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
 977         mpt_entry.reg_win       = TAVOR_MPT_IS_WINDOW;
 978         mpt_entry.mem_key       = mw->mr_rkey;
 979         mpt_entry.pd            = pd->pd_pdnum;
 980 
 981         /*
 982          * Write the MPT entry to hardware.  Lastly, we pass ownership of
 983          * the entry to the hardware.  Note: in general, this operation
 984          * shouldn't fail.  But if it does, we have to undo everything we've
 985          * done above before returning error.
 986          */
 987         status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
 988             sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
 989         if (status != TAVOR_CMD_SUCCESS) {
 990                 cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
 991                     status);
 992                 TNF_PROBE_1(tavor_mw_alloc_sw2hw_mpt_cmd_fail,
 993                     TAVOR_TNF_ERROR, "", tnf_uint, status, status);
 994                 /* Set "status" and "errormsg" and goto failure */
 995                 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
 996                     "tavor SW2HW_MPT command");
 997                 goto mwalloc_fail3;
 998         }
 999 
1000         /*
1001          * Fill in the rest of the Tavor Memory Window handle.  Having
1002          * successfully transferred ownership of the MPT, we can update the
1003          * following fields for use in further operations on the MW.
1004          */
1005         mw->mr_mptrsrcp      = mpt;
1006         mw->mr_pdhdl = pd;
1007         mw->mr_rsrcp = rsrc;
1008         *mwhdl = mw;
1009 
1010         TAVOR_TNF_EXIT(tavor_mw_alloc);
1011         return (DDI_SUCCESS);
1012 
1013 mwalloc_fail3:
1014         tavor_rsrc_free(state, &rsrc);
1015 mwalloc_fail2:
1016         tavor_rsrc_free(state, &mpt);
1017 mwalloc_fail1:
1018         tavor_pd_refcnt_dec(pd);
1019 mwalloc_fail:
1020         TNF_PROBE_1(tavor_mw_alloc_fail, TAVOR_TNF_ERROR, "",
1021             tnf_string, msg, errormsg);
1022         TAVOR_TNF_EXIT(tavor_mw_alloc);
1023         return (status);
1024 }
1025 
1026 
1027 /*
1028  * tavor_mw_free()
1029  *    Context: Can be called from interrupt or base context.
1030  */
1031 int
1032 tavor_mw_free(tavor_state_t *state, tavor_mwhdl_t *mwhdl, uint_t sleep)
1033 {
1034         tavor_rsrc_t            *mpt, *rsrc;
1035         tavor_mwhdl_t           mw;
1036         int                     status;
1037         char                    *errormsg;
1038         tavor_pdhdl_t           pd;
1039 
1040         TAVOR_TNF_ENTER(tavor_mw_free);
1041 
1042         /*
1043          * Check the sleep flag.  Ensure that it is consistent with the
1044          * current thread context (i.e. if we are currently in the interrupt
1045          * context, then we shouldn't be attempting to sleep).
1046          */
1047         if ((sleep == TAVOR_SLEEP) &&
1048             (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1049                 /* Set "status" and "errormsg" and goto failure */
1050                 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
1051                 TNF_PROBE_1(tavor_mw_free_fail, TAVOR_TNF_ERROR, "",
1052                     tnf_string, msg, errormsg);
1053                 TAVOR_TNF_EXIT(tavor_mw_free);
1054                 return (status);
1055         }
1056 
1057         /*
1058          * Pull all the necessary information from the Tavor Memory Window
1059          * handle.  This is necessary here because the resource for the
1060          * MW handle is going to be freed up as part of the this operation.
1061          */
1062         mw      = *mwhdl;
1063         mutex_enter(&mw->mr_lock);
1064         mpt     = mw->mr_mptrsrcp;
1065         rsrc    = mw->mr_rsrcp;
1066         pd      = mw->mr_pdhdl;
1067         mutex_exit(&mw->mr_lock);
1068 
1069         /*
1070          * Reclaim the MPT entry from hardware.  Note: in general, it is
1071          * unexpected for this operation to return an error.
1072          */
1073         status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
1074             0, mpt->tr_indx, sleep);
1075         if (status != TAVOR_CMD_SUCCESS) {
1076                 cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: %08x\n",
1077                     status);
1078                 TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail, TAVOR_TNF_ERROR, "",
1079                     tnf_uint, status, status);
1080                 TAVOR_TNF_EXIT(tavor_mw_free);
1081                 return (IBT_INVALID_PARAM);
1082         }
1083 
1084         /* Free the Tavor Memory Window handle */
1085         tavor_rsrc_free(state, &rsrc);
1086 
1087         /* Free up the MPT entry resource */
1088         tavor_rsrc_free(state, &mpt);
1089 
1090         /* Decrement the reference count on the protection domain (PD) */
1091         tavor_pd_refcnt_dec(pd);
1092 
1093         /* Set the mwhdl pointer to NULL and return success */
1094         *mwhdl = NULL;
1095 
1096         TAVOR_TNF_EXIT(tavor_mw_free);
1097         return (DDI_SUCCESS);
1098 }
1099 
1100 
1101 /*
1102  * tavor_mr_keycalc()
1103  *    Context: Can be called from interrupt or base context.
1104  */
1105 void
1106 tavor_mr_keycalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
1107 {
1108         uint32_t        tmp, log_num_mpt;
1109 
1110         /*
1111          * Generate a simple key from counter.  Note:  We increment this
1112          * static variable _intentionally_ without any kind of mutex around
1113          * it.  First, single-threading all operations through a single lock
1114          * would be a bad idea (from a performance point-of-view).  Second,
1115          * the upper "unconstrained" bits don't really have to be unique
1116          * because the lower bits are guaranteed to be (although we do make a
1117          * best effort to ensure that they are).  Third, the window for the
1118          * race (where both threads read and update the counter at the same
1119          * time) is incredibly small.
1120          * And, lastly, we'd like to make this into a "random" key XXX
1121          */
1122         log_num_mpt = state->ts_cfg_profile->cp_log_num_mpt;
1123         tmp = (tavor_debug_memkey_cnt++) << log_num_mpt;
1124         *key = tmp | indx;
1125 }
1126 
1127 
1128 /*
1129  * tavor_mr_common_reg()
1130  *    Context: Can be called from interrupt or base context.
1131  */
1132 static int
1133 tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
1134     tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
1135 {
1136         tavor_rsrc_pool_info_t  *rsrc_pool;
1137         tavor_rsrc_t            *mpt, *mtt, *rsrc, *mtt_refcnt;
1138         tavor_umap_db_entry_t   *umapdb;
1139         tavor_sw_refcnt_t       *swrc_tmp;
1140         tavor_hw_mpt_t          mpt_entry;
1141         tavor_mrhdl_t           mr;
1142         ibt_mr_flags_t          flags;
1143         tavor_bind_info_t       *bh;
1144         ddi_dma_handle_t        bind_dmahdl;
1145         ddi_umem_cookie_t       umem_cookie;
1146         size_t                  umem_len;
1147         caddr_t                 umem_addr;
1148         uint64_t                mtt_addr, mtt_ddrbaseaddr, max_sz;
1149         uint_t                  sleep, mtt_pgsize_bits, bind_type, mr_is_umem;
1150         int                     status, umem_flags, bind_override_addr;
1151         char                    *errormsg;
1152 
1153         TAVOR_TNF_ENTER(tavor_mr_common_reg);
1154 
1155         /*
1156          * Check the "options" flag.  Currently this flag tells the driver
1157          * whether or not the region should be bound normally (i.e. with
1158          * entries written into the PCI IOMMU), whether it should be
1159          * registered to bypass the IOMMU, and whether or not the resulting
1160          * address should be "zero-based" (to aid the alignment restrictions
1161          * for QPs).
1162          */
1163         if (op == NULL) {
1164                 bind_type   = TAVOR_BINDMEM_NORMAL;
1165                 bind_dmahdl = NULL;
1166                 bind_override_addr = 0;
1167         } else {
1168                 bind_type          = op->mro_bind_type;
1169                 bind_dmahdl        = op->mro_bind_dmahdl;
1170                 bind_override_addr = op->mro_bind_override_addr;
1171         }
1172 
1173         /* Extract the flags field from the tavor_bind_info_t */
1174         flags = bind->bi_flags;
1175 
1176         /*
1177          * Check for invalid length.  Check if the length is zero or if the
1178          * length is larger than the maximum configured value.  Return error
1179          * if it is.
1180          */
1181         max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
1182         if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
1183                 /* Set "status" and "errormsg" and goto failure */
1184                 TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
1185                 goto mrcommon_fail;
1186         }
1187 
1188         /*
1189          * Check the sleep flag.  Ensure that it is consistent with the
1190          * current thread context (i.e. if we are currently in the interrupt
1191          * context, then we shouldn't be attempting to sleep).
1192          */
1193         sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1194         if ((sleep == TAVOR_SLEEP) &&
1195             (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1196                 /* Set "status" and "errormsg" and goto failure */
1197                 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1198                 goto mrcommon_fail;
1199         }
1200 
1201         /*
1202          * Get the base address for the MTT table.  This will be necessary
1203          * below when we are setting up the MPT entry.
1204          */
1205         rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
1206         mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
1207 
1208         /* Increment the reference count on the protection domain (PD) */
1209         tavor_pd_refcnt_inc(pd);
1210 
1211         /*
1212          * Allocate an MPT entry.  This will be filled in with all the
1213          * necessary parameters to define the memory region.  And then
1214          * ownership will be passed to the hardware in the final step
1215          * below.  If we fail here, we must undo the protection domain
1216          * reference count.
1217          */
1218         status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
1219         if (status != DDI_SUCCESS) {
1220                 /* Set "status" and "errormsg" and goto failure */
1221                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
1222                 goto mrcommon_fail1;
1223         }
1224 
1225         /*
1226          * Allocate the software structure for tracking the memory region (i.e.
1227          * the Tavor Memory Region handle).  If we fail here, we must undo
1228          * the protection domain reference count and the previous resource
1229          * allocation.
1230          */
1231         status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
1232         if (status != DDI_SUCCESS) {
1233                 /* Set "status" and "errormsg" and goto failure */
1234                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
1235                 goto mrcommon_fail2;
1236         }
1237         mr = (tavor_mrhdl_t)rsrc->tr_addr;
1238 
1239         /*
1240          * Setup and validate the memory region access flags.  This means
1241          * translating the IBTF's enable flags into the access flags that
1242          * will be used in later operations.
1243          */
1244         mr->mr_accflag = 0;
1245         if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1246                 mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1247         if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1248                 mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1249         if (flags & IBT_MR_ENABLE_REMOTE_READ)
1250                 mr->mr_accflag |= IBT_MR_REMOTE_READ;
1251         if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1252                 mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1253         if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1254                 mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1255 
1256         /*
1257          * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1258          * from a certain number of "constrained" bits (the least significant
1259          * bits) and some number of "unconstrained" bits.  The constrained
1260          * bits must be set to the index of the entry in the MPT table, but
1261          * the unconstrained bits can be set to any value we wish.  Note:
1262          * if no remote access is required, then the RKey value is not filled
1263          * in.  Otherwise both Rkey and LKey are given the same value.
1264          */
1265         tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1266         if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1267             (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1268             (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1269                 mr->mr_rkey = mr->mr_lkey;
1270         }
1271 
1272         /*
1273          * Determine if the memory is from userland and pin the pages
1274          * with umem_lockmemory() if necessary.
1275          * Then, if this is userland memory, allocate an entry in the
1276          * "userland resources database".  This will later be added to
1277          * the database (after all further memory registration operations are
1278          * successful).  If we fail here, we must undo the reference counts
1279          * and the previous resource allocations.
1280          */
1281         mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 1 : 0);
1282         if (mr_is_umem) {
1283                 umem_len   = ptob(btopr(bind->bi_len +
1284                     ((uintptr_t)bind->bi_addr & PAGEOFFSET)));
1285                 umem_addr  = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET);
1286                 umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
1287                     DDI_UMEMLOCK_LONGTERM);
1288                 status = umem_lockmemory(umem_addr, umem_len, umem_flags,
1289                     &umem_cookie, &tavor_umem_cbops, NULL);
1290                 if (status != 0) {
1291                         /* Set "status" and "errormsg" and goto failure */
1292                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
1293                         goto mrcommon_fail3;
1294                 }
1295 
1296                 bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len,
1297                     B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
1298                 if (bind->bi_buf == NULL) {
1299                         /* Set "status" and "errormsg" and goto failure */
1300                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed iosetup");
1301                         goto mrcommon_fail3;
1302                 }
1303                 bind->bi_type = TAVOR_BINDHDL_UBUF;
1304                 bind->bi_buf->b_flags |= B_READ;
1305 
1306                 umapdb = tavor_umap_db_alloc(state->ts_instance,
1307                     (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
1308                     (uint64_t)(uintptr_t)rsrc);
1309                 if (umapdb == NULL) {
1310                         /* Set "status" and "errormsg" and goto failure */
1311                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
1312                         goto mrcommon_fail4;
1313                 }
1314         }
1315 
1316         /*
1317          * Setup the bindinfo for the mtt bind call
1318          */
1319         bh = &mr->mr_bindinfo;
1320         bcopy(bind, bh, sizeof (tavor_bind_info_t));
1321         bh->bi_bypass = bind_type;
1322         status = tavor_mr_mtt_bind(state, bh, bind_dmahdl, &mtt,
1323             &mtt_pgsize_bits);
1324         if (status != DDI_SUCCESS) {
1325                 /* Set "status" and "errormsg" and goto failure */
1326                 TAVOR_TNF_FAIL(status, "failed mtt bind");
1327                 /*
1328                  * When mtt_bind fails, freerbuf has already been done,
1329                  * so make sure not to call it again.
1330                  */
1331                 bind->bi_type = bh->bi_type;
1332                 goto mrcommon_fail5;
1333         }
1334         mr->mr_logmttpgsz = mtt_pgsize_bits;
1335 
1336         /*
1337          * Allocate MTT reference count (to track shared memory regions).
1338          * This reference count resource may never be used on the given
1339          * memory region, but if it is ever later registered as "shared"
1340          * memory region then this resource will be necessary.  If we fail
1341          * here, we do pretty much the same as above to clean up.
1342          */
1343         status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1, sleep,
1344             &mtt_refcnt);
1345         if (status != DDI_SUCCESS) {
1346                 /* Set "status" and "errormsg" and goto failure */
1347                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed refence count");
1348                 goto mrcommon_fail6;
1349         }
1350         mr->mr_mttrefcntp = mtt_refcnt;
1351         swrc_tmp = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
1352         TAVOR_MTT_REFCNT_INIT(swrc_tmp);
1353 
1354         /*
1355          * Fill in the MPT entry.  This is the final step before passing
1356          * ownership of the MPT entry to the Tavor hardware.  We use all of
1357          * the information collected/calculated above to fill in the
1358          * requisite portions of the MPT.
1359          */
1360         bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
1361         mpt_entry.m_io    = TAVOR_MEM_CYCLE_GENERATE;
1362         mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1363         mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1364         mpt_entry.rw      = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1365         mpt_entry.rr      = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1366         mpt_entry.lw      = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1367         mpt_entry.lr      = 1;
1368         mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
1369         mpt_entry.page_sz       = mr->mr_logmttpgsz - 0xC;
1370         mpt_entry.mem_key       = mr->mr_lkey;
1371         mpt_entry.pd            = pd->pd_pdnum;
1372         if (bind_override_addr == 0) {
1373                 mpt_entry.start_addr = bh->bi_addr;
1374         } else {
1375                 bh->bi_addr = bh->bi_addr & ((1 << mr->mr_logmttpgsz) - 1);
1376                 mpt_entry.start_addr = bh->bi_addr;
1377         }
1378         mpt_entry.reg_win_len   = bh->bi_len;
1379         mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND;
1380         mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
1381         mpt_entry.mttseg_addr_h = mtt_addr >> 32;
1382         mpt_entry.mttseg_addr_l = mtt_addr >> 6;
1383 
1384         /*
1385          * Write the MPT entry to hardware.  Lastly, we pass ownership of
1386          * the entry to the hardware.  Note: in general, this operation
1387          * shouldn't fail.  But if it does, we have to undo everything we've
1388          * done above before returning error.
1389          */
1390         status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1391             sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
1392         if (status != TAVOR_CMD_SUCCESS) {
1393                 cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1394                     status);
1395                 TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
1396                     TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1397                 /* Set "status" and "errormsg" and goto failure */
1398                 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1399                     "tavor SW2HW_MPT command");
1400                 goto mrcommon_fail7;
1401         }
1402 
1403         /*
1404          * Fill in the rest of the Tavor Memory Region handle.  Having
1405          * successfully transferred ownership of the MPT, we can update the
1406          * following fields for use in further operations on the MR.
1407          */
1408         mr->mr_mptrsrcp        = mpt;
1409         mr->mr_mttrsrcp        = mtt;
1410         mr->mr_pdhdl   = pd;
1411         mr->mr_rsrcp   = rsrc;
1412         mr->mr_is_umem         = mr_is_umem;
1413         mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
1414         mr->mr_umem_cbfunc = NULL;
1415         mr->mr_umem_cbarg1 = NULL;
1416         mr->mr_umem_cbarg2 = NULL;
1417 
1418         /*
1419          * If this is userland memory, then we need to insert the previously
1420          * allocated entry into the "userland resources database".  This will
1421          * allow for later coordination between the tavor_umap_umemlock_cb()
1422          * callback and tavor_mr_deregister().
1423          */
1424         if (mr_is_umem) {
1425                 tavor_umap_db_add(umapdb);
1426         }
1427 
1428         *mrhdl = mr;
1429 
1430         TAVOR_TNF_EXIT(tavor_mr_common_reg);
1431         return (DDI_SUCCESS);
1432 
1433 /*
1434  * The following is cleanup for all possible failure cases in this routine
1435  */
1436 mrcommon_fail7:
1437         tavor_rsrc_free(state, &mtt_refcnt);
1438 mrcommon_fail6:
1439         tavor_rsrc_free(state, &mtt);
1440         tavor_mr_mem_unbind(state, bh);
1441         bind->bi_type = bh->bi_type;
1442 mrcommon_fail5:
1443         if (mr_is_umem) {
1444                 tavor_umap_db_free(umapdb);
1445         }
1446 mrcommon_fail4:
1447         if (mr_is_umem) {
1448                 /*
1449                  * Free up the memory ddi_umem_iosetup() allocates
1450                  * internally.
1451                  */
1452                 if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
1453                         freerbuf(bind->bi_buf);
1454                         bind->bi_type = TAVOR_BINDHDL_NONE;
1455                 }
1456                 ddi_umem_unlock(umem_cookie);
1457         }
1458 mrcommon_fail3:
1459         tavor_rsrc_free(state, &rsrc);
1460 mrcommon_fail2:
1461         tavor_rsrc_free(state, &mpt);
1462 mrcommon_fail1:
1463         tavor_pd_refcnt_dec(pd);
1464 mrcommon_fail:
1465         TNF_PROBE_1(tavor_mr_common_reg_fail, TAVOR_TNF_ERROR, "",
1466             tnf_string, msg, errormsg);
1467         TAVOR_TNF_EXIT(tavor_mr_common_reg);
1468         return (status);
1469 }
1470 
1471 int
1472 tavor_dma_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
1473     ibt_dmr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl)
1474 {
1475         tavor_rsrc_t            *mpt, *rsrc;
1476         tavor_hw_mpt_t          mpt_entry;
1477         tavor_mrhdl_t           mr;
1478         ibt_mr_flags_t          flags;
1479         uint_t                  sleep;
1480         int                     status;
1481 
1482         /* Extract the flags field */
1483         flags = mr_attr->dmr_flags;
1484 
1485         /*
1486          * Check the sleep flag.  Ensure that it is consistent with the
1487          * current thread context (i.e. if we are currently in the interrupt
1488          * context, then we shouldn't be attempting to sleep).
1489          */
1490         sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1491         if ((sleep == TAVOR_SLEEP) &&
1492             (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1493                 status = IBT_INVALID_PARAM;
1494                 goto mrcommon_fail;
1495         }
1496 
1497         /* Increment the reference count on the protection domain (PD) */
1498         tavor_pd_refcnt_inc(pd);
1499 
1500         /*
1501          * Allocate an MPT entry.  This will be filled in with all the
1502          * necessary parameters to define the memory region.  And then
1503          * ownership will be passed to the hardware in the final step
1504          * below.  If we fail here, we must undo the protection domain
1505          * reference count.
1506          */
1507         status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
1508         if (status != DDI_SUCCESS) {
1509                 status = IBT_INSUFF_RESOURCE;
1510                 goto mrcommon_fail1;
1511         }
1512 
1513         /*
1514          * Allocate the software structure for tracking the memory region (i.e.
1515          * the Tavor Memory Region handle).  If we fail here, we must undo
1516          * the protection domain reference count and the previous resource
1517          * allocation.
1518          */
1519         status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
1520         if (status != DDI_SUCCESS) {
1521                 status = IBT_INSUFF_RESOURCE;
1522                 goto mrcommon_fail2;
1523         }
1524         mr = (tavor_mrhdl_t)rsrc->tr_addr;
1525         bzero(mr, sizeof (*mr));
1526 
1527         /*
1528          * Setup and validate the memory region access flags.  This means
1529          * translating the IBTF's enable flags into the access flags that
1530          * will be used in later operations.
1531          */
1532         mr->mr_accflag = 0;
1533         if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1534                 mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1535         if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1536                 mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1537         if (flags & IBT_MR_ENABLE_REMOTE_READ)
1538                 mr->mr_accflag |= IBT_MR_REMOTE_READ;
1539         if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1540                 mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1541         if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1542                 mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1543 
1544         /*
1545          * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1546          * from a certain number of "constrained" bits (the least significant
1547          * bits) and some number of "unconstrained" bits.  The constrained
1548          * bits must be set to the index of the entry in the MPT table, but
1549          * the unconstrained bits can be set to any value we wish.  Note:
1550          * if no remote access is required, then the RKey value is not filled
1551          * in.  Otherwise both Rkey and LKey are given the same value.
1552          */
1553         tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1554         if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1555             (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1556             (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1557                 mr->mr_rkey = mr->mr_lkey;
1558         }
1559 
1560         /*
1561          * Fill in the MPT entry.  This is the final step before passing
1562          * ownership of the MPT entry to the Tavor hardware.  We use all of
1563          * the information collected/calculated above to fill in the
1564          * requisite portions of the MPT.
1565          */
1566         bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
1567 
1568         mpt_entry.m_io    = TAVOR_MEM_CYCLE_GENERATE;
1569         mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1570         mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1571         mpt_entry.rw      = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1572         mpt_entry.rr      = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1573         mpt_entry.lw      = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1574         mpt_entry.lr      = 1;
1575         mpt_entry.phys_addr = 1;        /* critical bit for this */
1576         mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
1577 
1578         mpt_entry.page_sz       = mr->mr_logmttpgsz - 0xC;
1579         mpt_entry.mem_key       = mr->mr_lkey;
1580         mpt_entry.pd            = pd->pd_pdnum;
1581         mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND;
1582 
1583         mpt_entry.start_addr = mr_attr->dmr_paddr;
1584         mpt_entry.reg_win_len = mr_attr->dmr_len;
1585 
1586         mpt_entry.mttseg_addr_h = 0;
1587         mpt_entry.mttseg_addr_l = 0;
1588 
1589         /*
1590          * Write the MPT entry to hardware.  Lastly, we pass ownership of
1591          * the entry to the hardware if needed.  Note: in general, this
1592          * operation shouldn't fail.  But if it does, we have to undo
1593          * everything we've done above before returning error.
1594          *
1595          * For Tavor, this routine (which is common to the contexts) will only
1596          * set the ownership if needed - the process of passing the context
1597          * itself to HW will take care of setting up the MPT (based on type
1598          * and index).
1599          */
1600 
1601         status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1602             sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
1603         if (status != TAVOR_CMD_SUCCESS) {
1604                 cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1605                     status);
1606                 status = ibc_get_ci_failure(0);
1607                 goto mrcommon_fail7;
1608         }
1609 
1610         /*
1611          * Fill in the rest of the Tavor Memory Region handle.  Having
1612          * successfully transferred ownership of the MPT, we can update the
1613          * following fields for use in further operations on the MR.
1614          */
1615         mr->mr_mptrsrcp         = mpt;
1616         mr->mr_mttrsrcp         = NULL;
1617         mr->mr_pdhdl    = pd;
1618         mr->mr_rsrcp    = rsrc;
1619         mr->mr_is_umem          = 0;
1620         mr->mr_umemcookie  = NULL;
1621         mr->mr_umem_cbfunc = NULL;
1622         mr->mr_umem_cbarg1 = NULL;
1623         mr->mr_umem_cbarg2 = NULL;
1624 
1625         *mrhdl = mr;
1626 
1627         return (DDI_SUCCESS);
1628 
1629 /*
1630  * The following is cleanup for all possible failure cases in this routine
1631  */
1632 mrcommon_fail7:
1633         tavor_rsrc_free(state, &rsrc);
1634 mrcommon_fail2:
1635         tavor_rsrc_free(state, &mpt);
1636 mrcommon_fail1:
1637         tavor_pd_refcnt_dec(pd);
1638 mrcommon_fail:
1639         return (status);
1640 }
1641 
1642 /*
1643  * tavor_mr_mtt_bind()
1644  *    Context: Can be called from interrupt or base context.
1645  */
1646 int
1647 tavor_mr_mtt_bind(tavor_state_t *state, tavor_bind_info_t *bind,
1648     ddi_dma_handle_t bind_dmahdl, tavor_rsrc_t **mtt, uint_t *mtt_pgsize_bits)
1649 {
1650         uint64_t                nummtt;
1651         uint_t                  sleep;
1652         int                     status;
1653         char                    *errormsg;
1654 
1655         TAVOR_TNF_ENTER(tavor_mr_common_reg);
1656 
1657         /*
1658          * Check the sleep flag.  Ensure that it is consistent with the
1659          * current thread context (i.e. if we are currently in the interrupt
1660          * context, then we shouldn't be attempting to sleep).
1661          */
1662         sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1663         if ((sleep == TAVOR_SLEEP) &&
1664             (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1665                 /* Set "status" and "errormsg" and goto failure */
1666                 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1667                 goto mrmttbind_fail;
1668         }
1669 
1670         /*
1671          * Bind the memory and determine the mapped addresses.  This is
1672          * the first of two routines that do all the "heavy lifting" for
1673          * the Tavor memory registration routines.  The tavor_mr_mem_bind()
1674          * routine takes the "bind" struct with all its fields filled
1675          * in and returns a list of DMA cookies (for the PCI mapped addresses
1676          * corresponding to the specified address region) which are used by
1677          * the tavor_mr_fast_mtt_write() routine below.  If we fail here, we
1678          * must undo all the previous resource allocation (and PD reference
1679          * count).
1680          */
1681         status = tavor_mr_mem_bind(state, bind, bind_dmahdl, sleep);
1682         if (status != DDI_SUCCESS) {
1683                 /* Set "status" and "errormsg" and goto failure */
1684                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
1685                 goto mrmttbind_fail;
1686         }
1687 
1688         /*
1689          * Determine number of pages spanned.  This routine uses the
1690          * information in the "bind" struct to determine the required
1691          * number of MTT entries needed (and returns the suggested page size -
1692          * as a "power-of-2" - for each MTT entry).
1693          */
1694         nummtt = tavor_mr_nummtt_needed(state, bind, mtt_pgsize_bits);
1695 
1696         /*
1697          * Allocate the MTT entries.  Use the calculations performed above to
1698          * allocate the required number of MTT entries.  Note: MTT entries are
1699          * allocated in "MTT segments" which consist of complete cachelines
1700          * (i.e. 8 entries, 16 entries, etc.)  So the TAVOR_NUMMTT_TO_MTTSEG()
1701          * macro is used to do the proper conversion.  If we fail here, we
1702          * must not only undo all the previous resource allocation (and PD
1703          * reference count), but we must also unbind the memory.
1704          */
1705         status = tavor_rsrc_alloc(state, TAVOR_MTT,
1706             TAVOR_NUMMTT_TO_MTTSEG(nummtt), sleep, mtt);
1707         if (status != DDI_SUCCESS) {
1708                 /* Set "status" and "errormsg" and goto failure */
1709                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
1710                 goto mrmttbind_fail2;
1711         }
1712 
1713         /*
1714          * Write the mapped addresses into the MTT entries.  This is part two
1715          * of the "heavy lifting" routines that we talked about above.  Note:
1716          * we pass the suggested page size from the earlier operation here.
1717          * And if we fail here, we again do pretty much the same huge clean up.
1718          */
1719         status = tavor_mr_fast_mtt_write(*mtt, bind, *mtt_pgsize_bits);
1720         if (status != DDI_SUCCESS) {
1721                 /* Set "status" and "errormsg" and goto failure */
1722                 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed write mtt");
1723                 goto mrmttbind_fail3;
1724         }
1725         TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
1726         return (DDI_SUCCESS);
1727 
1728 /*
1729  * The following is cleanup for all possible failure cases in this routine
1730  */
1731 mrmttbind_fail3:
1732         tavor_rsrc_free(state, mtt);
1733 mrmttbind_fail2:
1734         tavor_mr_mem_unbind(state, bind);
1735 mrmttbind_fail:
1736         TNF_PROBE_1(tavor_mr_mtt_bind_fail, TAVOR_TNF_ERROR, "",
1737             tnf_string, msg, errormsg);
1738         TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
1739         return (status);
1740 }
1741 
1742 
1743 /*
1744  * tavor_mr_mtt_unbind()
1745  *    Context: Can be called from interrupt or base context.
1746  */
1747 int
1748 tavor_mr_mtt_unbind(tavor_state_t *state, tavor_bind_info_t *bind,
1749     tavor_rsrc_t *mtt)
1750 {
1751         TAVOR_TNF_ENTER(tavor_mr_mtt_unbind);
1752 
1753         /*
1754          * Free up the MTT entries and unbind the memory.  Here, as above, we
1755          * attempt to free these resources only if it is appropriate to do so.
1756          */
1757         tavor_mr_mem_unbind(state, bind);
1758         tavor_rsrc_free(state, &mtt);
1759 
1760         TAVOR_TNF_EXIT(tavor_mr_mtt_unbind);
1761         return (DDI_SUCCESS);
1762 }
1763 
1764 
1765 /*
1766  * tavor_mr_common_rereg()
1767  *    Context: Can be called from interrupt or base context.
1768  */
1769 static int
1770 tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
1771     tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
1772     tavor_mr_options_t *op)
1773 {
1774         tavor_rsrc_t            *mpt;
1775         ibt_mr_attr_flags_t     acc_flags_to_use;
1776         ibt_mr_flags_t          flags;
1777         tavor_pdhdl_t           pd_to_use;
1778         tavor_hw_mpt_t          mpt_entry;
1779         uint64_t                mtt_addr_to_use, vaddr_to_use, len_to_use;
1780         uint_t                  sleep, dereg_level;
1781         int                     status;
1782         char                    *errormsg;
1783 
1784         TAVOR_TNF_ENTER(tavor_mr_common_rereg);
1785 
1786         /*
1787          * Check here to see if the memory region corresponds to a userland
1788          * mapping.  Reregistration of userland memory regions is not
1789          * currently supported.  Return failure. XXX
1790          */
1791         if (mr->mr_is_umem) {
1792                 /* Set "status" and "errormsg" and goto failure */
1793                 TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
1794                 goto mrrereg_fail;
1795         }
1796 
1797         mutex_enter(&mr->mr_lock);
1798 
1799         /* Pull MPT resource pointer from the Tavor Memory Region handle */
1800         mpt = mr->mr_mptrsrcp;
1801 
1802         /* Extract the flags field from the tavor_bind_info_t */
1803         flags = bind->bi_flags;
1804 
1805         /*
1806          * Check the sleep flag.  Ensure that it is consistent with the
1807          * current thread context (i.e. if we are currently in the interrupt
1808          * context, then we shouldn't be attempting to sleep).
1809          */
1810         sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1811         if ((sleep == TAVOR_SLEEP) &&
1812             (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1813                 mutex_exit(&mr->mr_lock);
1814                 /* Set "status" and "errormsg" and goto failure */
1815                 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1816                 goto mrrereg_fail;
1817         }
1818 
1819         /*
1820          * First step is to temporarily invalidate the MPT entry.  This
1821          * regains ownership from the hardware, and gives us the opportunity
1822          * to modify the entry.  Note: The HW2SW_MPT command returns the
1823          * current MPT entry contents.  These are saved away here because
1824          * they will be reused in a later step below.  If the region has
1825          * bound memory windows that we fail returning an "in use" error code.
1826          * Otherwise, this is an unexpected error and we deregister the
1827          * memory region and return error.
1828          *
1829          * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
1830          * against holding the lock around this rereg call in all contexts.
1831          */
1832         status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
1833             sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
1834         if (status != TAVOR_CMD_SUCCESS) {
1835                 mutex_exit(&mr->mr_lock);
1836                 if (status == TAVOR_CMD_REG_BOUND) {
1837                         TAVOR_TNF_EXIT(tavor_mr_common_rereg);
1838                         return (IBT_MR_IN_USE);
1839                 } else {
1840                         cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: "
1841                             "%08x\n", status);
1842 
1843                         /*
1844                          * Call deregister and ensure that all current
1845                          * resources get freed up
1846                          */
1847                         if (tavor_mr_deregister(state, &mr,
1848                             TAVOR_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
1849                                 TAVOR_WARNING(state, "failed to deregister "
1850                                     "memory region");
1851                         }
1852                         TNF_PROBE_1(tavor_mr_common_rereg_hw2sw_mpt_cmd_fail,
1853                             TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1854                         TAVOR_TNF_EXIT(tavor_mr_common_rereg);
1855                         return (ibc_get_ci_failure(0));
1856                 }
1857         }
1858 
1859         /*
1860          * If we're changing the protection domain, then validate the new one
1861          */
1862         if (flags & IBT_MR_CHANGE_PD) {
1863 
1864                 /* Check for valid PD handle pointer */
1865                 if (pd == NULL) {
1866                         mutex_exit(&mr->mr_lock);
1867                         /*
1868                          * Call deregister and ensure that all current
1869                          * resources get properly freed up. Unnecessary
1870                          * here to attempt to regain software ownership
1871                          * of the MPT entry as that has already been
1872                          * done above.
1873                          */
1874                         if (tavor_mr_deregister(state, &mr,
1875                             TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
1876                             DDI_SUCCESS) {
1877                                 TAVOR_WARNING(state, "failed to deregister "
1878                                     "memory region");
1879                         }
1880                         /* Set "status" and "errormsg" and goto failure */
1881                         TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
1882                         goto mrrereg_fail;
1883                 }
1884 
1885                 /* Use the new PD handle in all operations below */
1886                 pd_to_use = pd;
1887 
1888         } else {
1889                 /* Use the current PD handle in all operations below */
1890                 pd_to_use = mr->mr_pdhdl;
1891         }
1892 
1893         /*
1894          * If we're changing access permissions, then validate the new ones
1895          */
1896         if (flags & IBT_MR_CHANGE_ACCESS) {
1897                 /*
1898                  * Validate the access flags.  Both remote write and remote
1899                  * atomic require the local write flag to be set
1900                  */
1901                 if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
1902                     (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
1903                     !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
1904                         mutex_exit(&mr->mr_lock);
1905                         /*
1906                          * Call deregister and ensure that all current
1907                          * resources get properly freed up. Unnecessary
1908                          * here to attempt to regain software ownership
1909                          * of the MPT entry as that has already been
1910                          * done above.
1911                          */
1912                         if (tavor_mr_deregister(state, &mr,
1913                             TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
1914                             DDI_SUCCESS) {
1915                                 TAVOR_WARNING(state, "failed to deregister "
1916                                     "memory region");
1917                         }
1918                         /* Set "status" and "errormsg" and goto failure */
1919                         TAVOR_TNF_FAIL(IBT_MR_ACCESS_REQ_INVALID,
1920                             "invalid access flags");
1921                         goto mrrereg_fail;
1922                 }
1923 
1924                 /*
1925                  * Setup and validate the memory region access flags.  This
1926                  * means translating the IBTF's enable flags into the access
1927                  * flags that will be used in later operations.
1928                  */
1929                 acc_flags_to_use = 0;
1930                 if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1931                         acc_flags_to_use |= IBT_MR_WINDOW_BIND;
1932                 if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1933                         acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
1934                 if (flags & IBT_MR_ENABLE_REMOTE_READ)
1935                         acc_flags_to_use |= IBT_MR_REMOTE_READ;
1936                 if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1937                         acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
1938                 if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1939                         acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;
1940 
1941         } else {
1942                 acc_flags_to_use = mr->mr_accflag;
1943         }
1944 
1945         /*
1946          * If we're modifying the translation, then figure out whether
1947          * we can reuse the current MTT resources.  This means calling
1948          * tavor_mr_rereg_xlat_helper() which does most of the heavy lifting
1949          * for the reregistration.  If the current memory region contains
1950          * sufficient MTT entries for the new regions, then it will be
1951          * reused and filled in.  Otherwise, new entries will be allocated,
1952          * the old ones will be freed, and the new entries will be filled
1953          * in.  Note:  If we're not modifying the translation, then we
1954          * should already have all the information we need to update the MPT.
1955          * Also note: If tavor_mr_rereg_xlat_helper() fails, it will return
1956          * a "dereg_level" which is the level of cleanup that needs to be
1957          * passed to tavor_mr_deregister() to finish the cleanup.
1958          */
1959         if (flags & IBT_MR_CHANGE_TRANSLATION) {
1960                 status = tavor_mr_rereg_xlat_helper(state, mr, bind, op,
1961                     &mtt_addr_to_use, sleep, &dereg_level);
1962                 if (status != DDI_SUCCESS) {
1963                         mutex_exit(&mr->mr_lock);
1964                         /*
1965                          * Call deregister and ensure that all resources get
1966                          * properly freed up.
1967                          */
1968                         if (tavor_mr_deregister(state, &mr, dereg_level,
1969                             sleep) != DDI_SUCCESS) {
1970                                 TAVOR_WARNING(state, "failed to deregister "
1971                                     "memory region");
1972                         }
1973 
1974                         /* Set "status" and "errormsg" and goto failure */
1975                         TAVOR_TNF_FAIL(status, "failed rereg helper");
1976                         goto mrrereg_fail;
1977                 }
1978                 vaddr_to_use = mr->mr_bindinfo.bi_addr;
1979                 len_to_use   = mr->mr_bindinfo.bi_len;
1980         } else {
1981                 mtt_addr_to_use = (((uint64_t)mpt_entry.mttseg_addr_h << 32) |
1982                     ((uint64_t)mpt_entry.mttseg_addr_l << 6));
1983                 vaddr_to_use = mr->mr_bindinfo.bi_addr;
1984                 len_to_use   = mr->mr_bindinfo.bi_len;
1985         }
1986 
1987         /*
1988          * Calculate new keys (Lkey, Rkey) from MPT index.  Just like they were
1989          * when the region was first registered, each key is formed from
1990          * "constrained" bits and "unconstrained" bits.  Note:  If no remote
1991          * access is required, then the RKey value is not filled in.  Otherwise
1992          * both Rkey and LKey are given the same value.
1993          */
1994         tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1995         if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
1996             (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
1997             (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
1998                 mr->mr_rkey = mr->mr_lkey;
1999         }
2000 
2001         /*
2002          * Update the MPT entry with the new information.  Some of this
2003          * information is retained from the previous operation, some of
2004          * it is new based on request.
2005          */
2006         mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND)   ? 1 : 0;
2007         mpt_entry.atomic  = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
2008         mpt_entry.rw      = (acc_flags_to_use & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
2009         mpt_entry.rr      = (acc_flags_to_use & IBT_MR_REMOTE_READ)   ? 1 : 0;
2010         mpt_entry.lw      = (acc_flags_to_use & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
2011         mpt_entry.page_sz       = mr->mr_logmttpgsz - 0xC;
2012         mpt_entry.mem_key       = mr->mr_lkey;
2013         mpt_entry.pd            = pd_to_use->pd_pdnum;
2014         mpt_entry.start_addr    = vaddr_to_use;
2015         mpt_entry.reg_win_len   = len_to_use;
2016         mpt_entry.mttseg_addr_h = mtt_addr_to_use >> 32;
2017         mpt_entry.mttseg_addr_l = mtt_addr_to_use >> 6;
2018 
2019         /*
2020          * Write the updated MPT entry to hardware
2021          *
2022          * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
2023          * against holding the lock around this rereg call in all contexts.
2024          */
2025         status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
2026             sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
2027         if (status != TAVOR_CMD_SUCCESS) {
2028                 mutex_exit(&mr->mr_lock);
2029                 cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
2030                     status);
2031                 /*
2032                  * Call deregister and ensure that all current resources get
2033                  * properly freed up. Unnecessary here to attempt to regain
2034                  * software ownership of the MPT entry as that has already
2035                  * been done above.
2036                  */
2037                 if (tavor_mr_deregister(state, &mr,
2038                     TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
2039                         TAVOR_WARNING(state, "failed to deregister memory "
2040                             "region");
2041                 }
2042                 TNF_PROBE_1(tavor_mr_common_rereg_sw2hw_mpt_cmd_fail,
2043                     TAVOR_TNF_ERROR, "", tnf_uint, status, status);
2044                 TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2045                 return (ibc_get_ci_failure(0));
2046         }
2047 
2048         /*
2049          * If we're changing PD, then update their reference counts now.
2050          * This means decrementing the reference count on the old PD and
2051          * incrementing the reference count on the new PD.
2052          */
2053         if (flags & IBT_MR_CHANGE_PD) {
2054                 tavor_pd_refcnt_dec(mr->mr_pdhdl);
2055                 tavor_pd_refcnt_inc(pd);
2056         }
2057 
2058         /*
2059          * Update the contents of the Tavor Memory Region handle to reflect
2060          * what has been changed.
2061          */
2062         mr->mr_pdhdl   = pd_to_use;
2063         mr->mr_accflag         = acc_flags_to_use;
2064         mr->mr_is_umem         = 0;
2065         mr->mr_umemcookie = NULL;
2066 
2067         /* New MR handle is same as the old */
2068         *mrhdl_new = mr;
2069         mutex_exit(&mr->mr_lock);
2070 
2071         TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2072         return (DDI_SUCCESS);
2073 
2074 mrrereg_fail:
2075         TNF_PROBE_1(tavor_mr_common_rereg_fail, TAVOR_TNF_ERROR, "",
2076             tnf_string, msg, errormsg);
2077         TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2078         return (status);
2079 }
2080 
2081 
2082 /*
2083  * tavor_mr_rereg_xlat_helper
2084  *    Context: Can be called from interrupt or base context.
2085  *    Note: This routine expects the "mr_lock" to be held when it
2086  *    is called.  Upon returning failure, this routine passes information
2087  *    about what "dereg_level" should be passed to tavor_mr_deregister().
2088  */
2089 static int
2090 tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
2091     tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
2092     uint_t sleep, uint_t *dereg_level)
2093 {
2094         tavor_rsrc_pool_info_t  *rsrc_pool;
2095         tavor_rsrc_t            *mtt, *mtt_refcnt;
2096         tavor_sw_refcnt_t       *swrc_old, *swrc_new;
2097         ddi_dma_handle_t        dmahdl;
2098         uint64_t                nummtt_needed, nummtt_in_currrsrc, max_sz;
2099         uint64_t                mtt_ddrbaseaddr;
2100         uint_t                  mtt_pgsize_bits, bind_type, reuse_dmahdl;
2101         int                     status;
2102         char                    *errormsg;
2103 
2104         TAVOR_TNF_ENTER(tavor_mr_rereg_xlat_helper);
2105 
2106         ASSERT(MUTEX_HELD(&mr->mr_lock));
2107 
2108         /*
2109          * Check the "options" flag.  Currently this flag tells the driver
2110          * whether or not the region should be bound normally (i.e. with
2111          * entries written into the PCI IOMMU) or whether it should be
2112          * registered to bypass the IOMMU.
2113          */
2114         if (op == NULL) {
2115                 bind_type = TAVOR_BINDMEM_NORMAL;
2116         } else {
2117                 bind_type = op->mro_bind_type;
2118         }
2119 
2120         /*
2121          * Check for invalid length.  Check is the length is zero or if the
2122          * length is larger than the maximum configured value.  Return error
2123          * if it is.
2124          */
2125         max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
2126         if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
2127                 /*
2128                  * Deregister will be called upon returning failure from this
2129                  * routine. This will ensure that all current resources get
2130                  * properly freed up. Unnecessary to attempt to regain
2131                  * software ownership of the MPT entry as that has already
2132                  * been done above (in tavor_mr_reregister())
2133                  */
2134                 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT;
2135 
2136                 /* Set "status" and "errormsg" and goto failure */
2137                 TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
2138                 goto mrrereghelp_fail;
2139         }
2140 
2141         /*
2142          * Determine the number of pages necessary for new region and the
2143          * number of pages supported by the current MTT resources
2144          */
2145         nummtt_needed = tavor_mr_nummtt_needed(state, bind, &mtt_pgsize_bits);
2146         nummtt_in_currrsrc = mr->mr_mttrsrcp->tr_len >> TAVOR_MTT_SIZE_SHIFT;
2147 
2148         /*
2149          * Depending on whether we have enough pages or not, the next step is
2150          * to fill in a set of MTT entries that reflect the new mapping.  In
2151          * the first case below, we already have enough entries.  This means
2152          * we need to unbind the memory from the previous mapping, bind the
2153          * memory for the new mapping, write the new MTT entries, and update
2154          * the mr to reflect the changes.
2155          * In the second case below, we do not have enough entries in the
2156          * current mapping.  So, in this case, we need not only to unbind the
2157          * current mapping, but we need to free up the MTT resources associated
2158          * with that mapping.  After we've successfully done that, we continue
2159          * by binding the new memory, allocating new MTT entries, writing the
2160          * new MTT entries, and updating the mr to reflect the changes.
2161          */
2162 
2163         /*
2164          * If this region is being shared (i.e. MTT refcount != 1), then we
2165          * can't reuse the current MTT resources regardless of their size.
2166          * Instead we'll need to alloc new ones (below) just as if there
2167          * hadn't been enough room in the current entries.
2168          */
2169         swrc_old = (tavor_sw_refcnt_t *)mr->mr_mttrefcntp->tr_addr;
2170         if (TAVOR_MTT_IS_NOT_SHARED(swrc_old) &&
2171             (nummtt_needed <= nummtt_in_currrsrc)) {
2172 
2173                 /*
2174                  * Unbind the old mapping for this memory region, but retain
2175                  * the ddi_dma_handle_t (if possible) for reuse in the bind
2176                  * operation below.  Note:  If original memory region was
2177                  * bound for IOMMU bypass and the new region can not use
2178                  * bypass, then a new DMA handle will be necessary.
2179                  */
2180                 if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2181                         mr->mr_bindinfo.bi_free_dmahdl = 0;
2182                         tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2183                         dmahdl = mr->mr_bindinfo.bi_dmahdl;
2184                         reuse_dmahdl = 1;
2185                 } else {
2186                         tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2187                         dmahdl = NULL;
2188                         reuse_dmahdl = 0;
2189                 }
2190 
2191                 /*
2192                  * Bind the new memory and determine the mapped addresses.
2193                  * As described, this routine and tavor_mr_fast_mtt_write()
2194                  * do the majority of the work for the memory registration
2195                  * operations.  Note:  When we successfully finish the binding,
2196                  * we will set the "bi_free_dmahdl" flag to indicate that
2197                  * even though we may have reused the ddi_dma_handle_t we do
2198                  * wish it to be freed up at some later time.  Note also that
2199                  * if we fail, we may need to cleanup the ddi_dma_handle_t.
2200                  */
2201                 bind->bi_bypass      = bind_type;
2202                 status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
2203                 if (status != DDI_SUCCESS) {
2204                         if (reuse_dmahdl) {
2205                                 ddi_dma_free_handle(&dmahdl);
2206                         }
2207 
2208                         /*
2209                          * Deregister will be called upon returning failure
2210                          * from this routine. This will ensure that all
2211                          * current resources get properly freed up.
2212                          * Unnecessary to attempt to regain software ownership
2213                          * of the MPT entry as that has already been done
2214                          * above (in tavor_mr_reregister()).  Also unnecessary
2215                          * to attempt to unbind the memory.
2216                          */
2217                         *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2218 
2219                         /* Set "status" and "errormsg" and goto failure */
2220                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
2221                         goto mrrereghelp_fail;
2222                 }
2223                 if (reuse_dmahdl) {
2224                         bind->bi_free_dmahdl = 1;
2225                 }
2226 
2227                 /*
2228                  * Using the new mapping, but reusing the current MTT
2229                  * resources, write the updated entries to MTT
2230                  */
2231                 mtt    = mr->mr_mttrsrcp;
2232                 status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
2233                 if (status != DDI_SUCCESS) {
2234                         /*
2235                          * Deregister will be called upon returning failure
2236                          * from this routine. This will ensure that all
2237                          * current resources get properly freed up.
2238                          * Unnecessary to attempt to regain software ownership
2239                          * of the MPT entry as that has already been done
2240                          * above (in tavor_mr_reregister()).  Also unnecessary
2241                          * to attempt to unbind the memory.
2242                          *
2243                          * But we do need to unbind the newly bound memory
2244                          * before returning.
2245                          */
2246                         tavor_mr_mem_unbind(state, bind);
2247                         *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2248 
2249                         /* Set "status" and "errormsg" and goto failure */
2250                         TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
2251                             "failed write mtt");
2252                         goto mrrereghelp_fail;
2253                 }
2254 
2255                 /* Put the updated information into the Mem Region handle */
2256                 mr->mr_bindinfo        = *bind;
2257                 mr->mr_logmttpgsz = mtt_pgsize_bits;
2258 
2259         } else {
2260                 /*
2261                  * Check if the memory region MTT is shared by any other MRs.
2262                  * Since the resource may be shared between multiple memory
2263                  * regions (as a result of a "RegisterSharedMR()" verb) it is
2264                  * important that we not unbind any resources prematurely.
2265                  */
2266                 if (!TAVOR_MTT_IS_SHARED(swrc_old)) {
2267                         /*
2268                          * Unbind the old mapping for this memory region, but
2269                          * retain the ddi_dma_handle_t for reuse in the bind
2270                          * operation below. Note: This can only be done here
2271                          * because the region being reregistered is not
2272                          * currently shared.  Also if original memory region
2273                          * was bound for IOMMU bypass and the new region can
2274                          * not use bypass, then a new DMA handle will be
2275                          * necessary.
2276                          */
2277                         if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2278                                 mr->mr_bindinfo.bi_free_dmahdl = 0;
2279                                 tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2280                                 dmahdl = mr->mr_bindinfo.bi_dmahdl;
2281                                 reuse_dmahdl = 1;
2282                         } else {
2283                                 tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2284                                 dmahdl = NULL;
2285                                 reuse_dmahdl = 0;
2286                         }
2287                 } else {
2288                         dmahdl = NULL;
2289                         reuse_dmahdl = 0;
2290                 }
2291 
2292                 /*
2293                  * Bind the new memory and determine the mapped addresses.
2294                  * As described, this routine and tavor_mr_fast_mtt_write()
2295                  * do the majority of the work for the memory registration
2296                  * operations.  Note:  When we successfully finish the binding,
2297                  * we will set the "bi_free_dmahdl" flag to indicate that
2298                  * even though we may have reused the ddi_dma_handle_t we do
2299                  * wish it to be freed up at some later time.  Note also that
2300                  * if we fail, we may need to cleanup the ddi_dma_handle_t.
2301                  */
2302                 bind->bi_bypass      = bind_type;
2303                 status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
2304                 if (status != DDI_SUCCESS) {
2305                         if (reuse_dmahdl) {
2306                                 ddi_dma_free_handle(&dmahdl);
2307                         }
2308 
2309                         /*
2310                          * Deregister will be called upon returning failure
2311                          * from this routine. This will ensure that all
2312                          * current resources get properly freed up.
2313                          * Unnecessary to attempt to regain software ownership
2314                          * of the MPT entry as that has already been done
2315                          * above (in tavor_mr_reregister()).  Also unnecessary
2316                          * to attempt to unbind the memory.
2317                          */
2318                         *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2319 
2320                         /* Set "status" and "errormsg" and goto failure */
2321                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
2322                         goto mrrereghelp_fail;
2323                 }
2324                 if (reuse_dmahdl) {
2325                         bind->bi_free_dmahdl = 1;
2326                 }
2327 
2328                 /*
2329                  * Allocate the new MTT entries resource
2330                  */
2331                 status = tavor_rsrc_alloc(state, TAVOR_MTT,
2332                     TAVOR_NUMMTT_TO_MTTSEG(nummtt_needed), sleep, &mtt);
2333                 if (status != DDI_SUCCESS) {
2334                         /*
2335                          * Deregister will be called upon returning failure
2336                          * from this routine. This will ensure that all
2337                          * current resources get properly freed up.
2338                          * Unnecessary to attempt to regain software ownership
2339                          * of the MPT entry as that has already been done
2340                          * above (in tavor_mr_reregister()).  Also unnecessary
2341                          * to attempt to unbind the memory.
2342                          *
2343                          * But we do need to unbind the newly bound memory
2344                          * before returning.
2345                          */
2346                         tavor_mr_mem_unbind(state, bind);
2347                         *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2348 
2349                         /* Set "status" and "errormsg" and goto failure */
2350                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
2351                         goto mrrereghelp_fail;
2352                 }
2353 
2354                 /*
2355                  * Allocate MTT reference count (to track shared memory
2356                  * regions).  As mentioned elsewhere above, this reference
2357                  * count resource may never be used on the given memory region,
2358                  * but if it is ever later registered as a "shared" memory
2359                  * region then this resource will be necessary.  Note:  This
2360                  * is only necessary here if the existing memory region is
2361                  * already being shared (because otherwise we already have
2362                  * a useable reference count resource).
2363                  */
2364                 if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2365                         status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1,
2366                             sleep, &mtt_refcnt);
2367                         if (status != DDI_SUCCESS) {
2368                                 /*
2369                                  * Deregister will be called upon returning
2370                                  * failure from this routine. This will ensure
2371                                  * that all current resources get properly
2372                                  * freed up.  Unnecessary to attempt to regain
2373                                  * software ownership of the MPT entry as that
2374                                  * has already been done above (in
2375                                  * tavor_mr_reregister()).  Also unnecessary
2376                                  * to attempt to unbind the memory.
2377                                  *
2378                                  * But we need to unbind the newly bound
2379                                  * memory and free up the newly allocated MTT
2380                                  * entries before returning.
2381                                  */
2382                                 tavor_mr_mem_unbind(state, bind);
2383                                 tavor_rsrc_free(state, &mtt);
2384                                 *dereg_level =
2385                                     TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2386 
2387                                 /* Set "status"/"errormsg", goto failure */
2388                                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
2389                                     "failed reference count");
2390                                 goto mrrereghelp_fail;
2391                         }
2392                         swrc_new = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
2393                         TAVOR_MTT_REFCNT_INIT(swrc_new);
2394                 } else {
2395                         mtt_refcnt = mr->mr_mttrefcntp;
2396                 }
2397 
2398                 /*
2399                  * Using the new mapping and the new MTT resources, write the
2400                  * updated entries to MTT
2401                  */
2402                 status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
2403                 if (status != DDI_SUCCESS) {
2404                         /*
2405                          * Deregister will be called upon returning failure
2406                          * from this routine. This will ensure that all
2407                          * current resources get properly freed up.
2408                          * Unnecessary to attempt to regain software ownership
2409                          * of the MPT entry as that has already been done
2410                          * above (in tavor_mr_reregister()).  Also unnecessary
2411                          * to attempt to unbind the memory.
2412                          *
2413                          * But we need to unbind the newly bound memory,
2414                          * free up the newly allocated MTT entries, and
2415                          * (possibly) free the new MTT reference count
2416                          * resource before returning.
2417                          */
2418                         if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2419                                 tavor_rsrc_free(state, &mtt_refcnt);
2420                         }
2421                         tavor_mr_mem_unbind(state, bind);
2422                         tavor_rsrc_free(state, &mtt);
2423                         *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2424 
2425                         /* Set "status" and "errormsg" and goto failure */
2426                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed write mtt");
2427                         goto mrrereghelp_fail;
2428                 }
2429 
2430                 /*
2431                  * Check if the memory region MTT is shared by any other MRs.
2432                  * Since the resource may be shared between multiple memory
2433                  * regions (as a result of a "RegisterSharedMR()" verb) it is
2434                  * important that we not free up any resources prematurely.
2435                  */
2436                 if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2437                         /* Decrement MTT reference count for "old" region */
2438                         (void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
2439                 } else {
2440                         /* Free up the old MTT entries resource */
2441                         tavor_rsrc_free(state, &mr->mr_mttrsrcp);
2442                 }
2443 
2444                 /* Put the updated information into the mrhdl */
2445                 mr->mr_bindinfo        = *bind;
2446                 mr->mr_logmttpgsz = mtt_pgsize_bits;
2447                 mr->mr_mttrsrcp   = mtt;
2448                 mr->mr_mttrefcntp = mtt_refcnt;
2449         }
2450 
2451         /*
2452          * Calculate and return the updated MTT address (in the DDR address
2453          * space).  This will be used by the caller (tavor_mr_reregister) in
2454          * the updated MPT entry
2455          */
2456         rsrc_pool       = &state->ts_rsrc_hdl[TAVOR_MTT];
2457         mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
2458         *mtt_addr       = mtt_ddrbaseaddr + (mtt->tr_indx <<
2459             TAVOR_MTT_SIZE_SHIFT);
2460 
2461         TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
2462         return (DDI_SUCCESS);
2463 
2464 mrrereghelp_fail:
2465         TNF_PROBE_1(tavor_mr_rereg_xlat_helper_fail, TAVOR_TNF_ERROR, "",
2466             tnf_string, msg, errormsg);
2467         TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
2468         return (status);
2469 }
2470 
2471 
2472 /*
2473  * tavor_mr_nummtt_needed()
2474  *    Context: Can be called from interrupt or base context.
2475  */
2476 /* ARGSUSED */
2477 static uint64_t
2478 tavor_mr_nummtt_needed(tavor_state_t *state, tavor_bind_info_t *bind,
2479     uint_t *mtt_pgsize_bits)
2480 {
2481         uint64_t        pg_offset_mask;
2482         uint64_t        pg_offset, tmp_length;
2483 
2484         /*
2485          * For now we specify the page size as 8Kb (the default page size for
2486          * the sun4u architecture), or 4Kb for x86.  Figure out optimal page
2487          * size by examining the dmacookies XXX
2488          */
2489         *mtt_pgsize_bits = PAGESHIFT;
2490 
2491         pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1;
2492         pg_offset = bind->bi_addr & pg_offset_mask;
2493         tmp_length = pg_offset + (bind->bi_len - 1);
2494         return ((tmp_length >> *mtt_pgsize_bits) + 1);
2495 }
2496 
2497 
2498 /*
2499  * tavor_mr_mem_bind()
2500  *    Context: Can be called from interrupt or base context.
2501  */
2502 static int
2503 tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
2504     ddi_dma_handle_t dmahdl, uint_t sleep)
2505 {
2506         ddi_dma_attr_t  dma_attr;
2507         int             (*callback)(caddr_t);
2508         uint_t          dma_xfer_mode;
2509         int             status;
2510 
2511         /* bi_type must be set to a meaningful value to get a bind handle */
2512         ASSERT(bind->bi_type == TAVOR_BINDHDL_VADDR ||
2513             bind->bi_type == TAVOR_BINDHDL_BUF ||
2514             bind->bi_type == TAVOR_BINDHDL_UBUF);
2515 
2516         TAVOR_TNF_ENTER(tavor_mr_mem_bind);
2517 
2518         /* Set the callback flag appropriately */
2519         callback = (sleep == TAVOR_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;
2520 
2521         /* Determine whether to map STREAMING or CONSISTENT */
2522         dma_xfer_mode = (bind->bi_flags & IBT_MR_NONCOHERENT) ?
2523             DDI_DMA_STREAMING : DDI_DMA_CONSISTENT;
2524 
2525         /*
2526          * Initialize many of the default DMA attributes.  Then, if we're
2527          * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag.
2528          */
2529         if (dmahdl == NULL) {
2530                 tavor_dma_attr_init(&dma_attr);
2531 #ifdef  __sparc
2532                 /*
2533                  * First, disable streaming and switch to consistent if
2534                  * configured to do so and IOMMU BYPASS is enabled.
2535                  */
2536                 if (state->ts_cfg_profile->cp_disable_streaming_on_bypass &&
2537                     dma_xfer_mode == DDI_DMA_STREAMING &&
2538                     bind->bi_bypass == TAVOR_BINDMEM_BYPASS) {
2539                         dma_xfer_mode = DDI_DMA_CONSISTENT;
2540                 }
2541 
2542                 /*
2543                  * Then, if streaming is still specified, then "bypass" is not
2544                  * allowed.
2545                  */
2546                 if ((dma_xfer_mode == DDI_DMA_CONSISTENT) &&
2547                     (bind->bi_bypass == TAVOR_BINDMEM_BYPASS)) {
2548                         dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2549                 }
2550 #endif
2551                 /* Allocate a DMA handle for the binding */
2552                 status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr,
2553                     callback, NULL, &bind->bi_dmahdl);
2554                 if (status != DDI_SUCCESS) {
2555                         TNF_PROBE_0(tavor_mr_mem_bind_dmahdl_fail,
2556                             TAVOR_TNF_ERROR, "");
2557                         TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2558                         return (status);
2559                 }
2560                 bind->bi_free_dmahdl = 1;
2561 
2562         } else  {
2563                 bind->bi_dmahdl = dmahdl;
2564                 bind->bi_free_dmahdl = 0;
2565         }
2566 
2567         /*
2568          * Bind the memory to get the PCI mapped addresses.  The decision
2569          * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle()
2570          * is determined by the "bi_type" flag.  Note: if the bind operation
2571          * fails then we have to free up the DMA handle and return error.
2572          */
2573         if (bind->bi_type == TAVOR_BINDHDL_VADDR) {
2574                 status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL,
2575                     (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len,
2576                     (DDI_DMA_RDWR | dma_xfer_mode), callback, NULL,
2577                     &bind->bi_dmacookie, &bind->bi_cookiecnt);
2578         } else { /* TAVOR_BINDHDL_BUF || TAVOR_BINDHDL_UBUF */
2579                 status = ddi_dma_buf_bind_handle(bind->bi_dmahdl,
2580                     bind->bi_buf, (DDI_DMA_RDWR | dma_xfer_mode), callback,
2581                     NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt);
2582         }
2583 
2584         if (status != DDI_DMA_MAPPED) {
2585                 if (bind->bi_free_dmahdl != 0) {
2586                         ddi_dma_free_handle(&bind->bi_dmahdl);
2587                 }
2588                 TNF_PROBE_0(tavor_mr_mem_bind_dmabind_fail, TAVOR_TNF_ERROR,
2589                     "");
2590                 TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2591                 return (status);
2592         }
2593 
2594         TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2595         return (DDI_SUCCESS);
2596 }
2597 
2598 
2599 /*
2600  * tavor_mr_mem_unbind()
2601  *    Context: Can be called from interrupt or base context.
2602  */
2603 static void
2604 tavor_mr_mem_unbind(tavor_state_t *state, tavor_bind_info_t *bind)
2605 {
2606         int     status;
2607 
2608         TAVOR_TNF_ENTER(tavor_mr_mem_unbind);
2609 
2610         /*
2611          * In case of TAVOR_BINDHDL_UBUF, the memory bi_buf points to
2612          * is actually allocated by ddi_umem_iosetup() internally, then
2613          * it's required to free it here. Reset bi_type to TAVOR_BINDHDL_NONE
2614          * not to free it again later.
2615          */
2616         if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
2617                 freerbuf(bind->bi_buf);
2618                 bind->bi_type = TAVOR_BINDHDL_NONE;
2619         }
2620 
2621         /*
2622          * Unbind the DMA memory for the region
2623          *
2624          * Note: The only way ddi_dma_unbind_handle() currently
2625          * can return an error is if the handle passed in is invalid.
2626          * Since this should never happen, we choose to return void
2627          * from this function!  If this does return an error, however,
2628          * then we print a warning message to the console.
2629          */
2630         status = ddi_dma_unbind_handle(bind->bi_dmahdl);
2631         if (status != DDI_SUCCESS) {
2632                 TAVOR_WARNING(state, "failed to unbind DMA mapping");
2633                 TNF_PROBE_0(tavor_mr_mem_unbind_dmaunbind_fail,
2634                     TAVOR_TNF_ERROR, "");
2635                 TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2636                 return;
2637         }
2638 
2639         /* Free up the DMA handle */
2640         if (bind->bi_free_dmahdl != 0) {
2641                 ddi_dma_free_handle(&bind->bi_dmahdl);
2642         }
2643 
2644         TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2645 }
2646 
2647 
2648 /*
2649  * tavor_mr_fast_mtt_write()
2650  *    Context: Can be called from interrupt or base context.
2651  */
2652 static int
2653 tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
2654     uint32_t mtt_pgsize_bits)
2655 {
2656         ddi_dma_cookie_t        dmacookie;
2657         uint_t                  cookie_cnt;
2658         uint64_t                *mtt_table;
2659         uint64_t                mtt_entry;
2660         uint64_t                addr, endaddr;
2661         uint64_t                pagesize;
2662         int                     i;
2663 
2664         TAVOR_TNF_ENTER(tavor_mr_fast_mtt_write);
2665 
2666         /* Calculate page size from the suggested value passed in */
2667         pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2668 
2669         /*
2670          * Walk the "cookie list" and fill in the MTT table entries
2671          */
2672         i = 0;
2673         mtt_table  = (uint64_t *)mtt->tr_addr;
2674         dmacookie  = bind->bi_dmacookie;
2675         cookie_cnt = bind->bi_cookiecnt;
2676         while (cookie_cnt-- > 0) {
2677                 addr    = dmacookie.dmac_laddress;
2678                 endaddr = addr + (dmacookie.dmac_size - 1);
2679                 addr    = addr & ~((uint64_t)pagesize - 1);
2680                 while (addr <= endaddr) {
2681                         /*
2682                          * Fill in the mapped addresses (calculated above) and
2683                          * set TAVOR_MTT_ENTRY_PRESET flag for each MTT entry.
2684                          */
2685                         mtt_entry = addr | TAVOR_MTT_ENTRY_PRESET;
2686                         ddi_put64(mtt->tr_acchdl, &mtt_table[i], mtt_entry);
2687                         addr += pagesize;
2688                         i++;
2689 
2690                         if (addr == 0) {
2691                                 static int do_once = 1;
2692                                 if (do_once) {
2693                                         do_once = 0;
2694                                         cmn_err(CE_NOTE, "probable error in "
2695                                             "dma_cookie address from caller\n");
2696                                 }
2697                                 break;
2698                         }
2699                 }
2700 
2701                 /*
2702                  * When we've reached the end of the current DMA cookie,
2703                  * jump to the next cookie (if there are more)
2704                  */
2705                 if (cookie_cnt != 0) {
2706                         ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie);
2707                 }
2708         }
2709 
2710         TAVOR_TNF_EXIT(tavor_mr_fast_mtt_write);
2711         return (DDI_SUCCESS);
2712 }
2713 
2714 /*
2715  * tavor_mtt_refcnt_inc()
2716  *    Context: Can be called from interrupt or base context.
2717  */
2718 static int
2719 tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc)
2720 {
2721         tavor_sw_refcnt_t *rc;
2722         uint32_t          cnt;
2723 
2724         rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
2725 
2726         /* Increment the MTT's reference count */
2727         mutex_enter(&rc->swrc_lock);
2728         TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_inc, TAVOR_TNF_TRACE, "",
2729             tnf_uint, refcnt, rc->swrc_refcnt);
2730         cnt = rc->swrc_refcnt++;
2731         mutex_exit(&rc->swrc_lock);
2732 
2733         return (cnt);
2734 }
2735 
2736 
2737 /*
2738  * tavor_mtt_refcnt_dec()
2739  *    Context: Can be called from interrupt or base context.
2740  */
2741 static int
2742 tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc)
2743 {
2744         tavor_sw_refcnt_t *rc;
2745         uint32_t          cnt;
2746 
2747         rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
2748 
2749         /* Decrement the MTT's reference count */
2750         mutex_enter(&rc->swrc_lock);
2751         cnt = --rc->swrc_refcnt;
2752         TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_dec, TAVOR_TNF_TRACE, "",
2753             tnf_uint, refcnt, rc->swrc_refcnt);
2754         mutex_exit(&rc->swrc_lock);
2755 
2756         return (cnt);
2757 }