8368 remove warlock leftovers from usr/src/uts
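
warlock was Sun's static lock-analysis tool for Solaris drivers.  Code that
wanted clean warlock runs carried _NOTE(...) annotations and #ifdef
__lock_lint scaffolding whose only purpose was to guide the analyzer; with
the tool no longer available, this change strips those leftovers while
leaving runtime behavior untouched.  The hunks below show the affected
tavor SRQ routines first in their pre-change form (with the annotations)
and then in their post-change form.  A minimal sketch of the pattern being
removed, composed from constructs visible in the hunks:

	/* Before: annotations and a lock_lint-only locking branch */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))
	#ifdef __lock_lint
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
	#else
		if (srq->srq_wrid_wql != NULL) {
			mutex_enter(&srq->srq_wrid_wql->wql_lock);
		}
	#endif

	/* After: only the runtime logic remains */
	if (srq->srq_wrid_wql != NULL) {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
	}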


  61         ibt_srq_flags_t         flags;
  62         tavor_rsrc_t            *srqc, *rsrc;
  63         tavor_hw_srqc_t         srqc_entry;
  64         uint32_t                *buf;
  65         tavor_srqhdl_t          srq;
  66         tavor_umap_db_entry_t   *umapdb;
  67         ibt_mr_attr_t           mr_attr;
  68         tavor_mr_options_t      mr_op;
  69         tavor_mrhdl_t           mr;
  70         uint64_t                addr;
  71         uint64_t                value, srq_desc_off;
  72         uint32_t                lkey;
  73         uint32_t                log_srq_size;
  74         uint32_t                uarpg;
  75         uint_t                  wq_location, dma_xfer_mode, srq_is_umap;
  76         int                     flag, status;
  77         char                    *errormsg;
  78         uint_t                  max_sgl;
  79         uint_t                  wqesz;
  80 
  81         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))
  82 
  83         TAVOR_TNF_ENTER(tavor_srq_alloc);
  84 
  85         /*
  86          * Check the "options" flag.  Currently this flag tells the driver
  87          * whether the SRQ's work queues should come from normal system
  88          * memory or be allocated from DDR memory.
  89          */
  90         if (op == NULL) {
  91                 wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
  92         } else {
  93                 wq_location = op->srqo_wq_loc;
  94         }
  95 
  96         /*
  97          * Extract the necessary info from the tavor_srq_info_t structure
  98          */
  99         real_sizes = srqinfo->srqi_real_sizes;
 100         sizes      = srqinfo->srqi_sizes;
 101         pd         = srqinfo->srqi_pd;
 102         ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;


 126         /* Increase PD refcnt */
 127         tavor_pd_refcnt_inc(pd);
 128 
 129         /* Allocate an SRQ context entry */
 130         status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
 131         if (status != DDI_SUCCESS) {
 132                 /* Set "status" and "errormsg" and goto failure */
 133                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
 134                 goto srqalloc_fail1;
 135         }
 136 
 137         /* Allocate the SRQ Handle entry */
 138         status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
 139         if (status != DDI_SUCCESS) {
 140                 /* Set "status" and "errormsg" and goto failure */
 141                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
 142                 goto srqalloc_fail2;
 143         }
 144 
 145         srq = (tavor_srqhdl_t)rsrc->tr_addr;
 146         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))
 147 
 148         srq->srq_srqnum = srqc->tr_indx;  /* just use index */
 149 
 150         /*
 151          * If this will be a user-mappable SRQ, then allocate an entry for
 152          * the "userland resources database".  This will later be added to
 153          * the database (after all further SRQ operations are successful).
 154          * If we fail here, we must undo the reference counts and the
 155          * previous resource allocation.
 156          */
 157         if (srq_is_umap) {
 158                 umapdb = tavor_umap_db_alloc(state->ts_instance,
 159                     srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
 160                     (uint64_t)(uintptr_t)rsrc);
 161                 if (umapdb == NULL) {
 162                         /* Set "status" and "errormsg" and goto failure */
 163                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
 164                         goto srqalloc_fail3;
 165                 }
 166         }
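
The srqalloc_fail* labels used above form the usual goto-based unwind
ladder: each later failure point falls through the cleanup steps for
everything allocated before it.  A hypothetical sketch of its shape (the
label names appear in the hunks; the cleanup bodies are assumed, not
copied from this webrev):

	srqalloc_fail3:
		tavor_rsrc_free(state, &rsrc);		/* SRQ handle entry */
	srqalloc_fail2:
		tavor_rsrc_free(state, &srqc);		/* SRQ context entry */
	srqalloc_fail1:
		tavor_pd_refcnt_dec(pd);		/* undo PD refcnt */
		/* ... report "errormsg" and return "status" ... */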


 240          * a zero-based queue.  By making sure we are aligned on at least a
 241          * page, any offset we use into our queue will be the same as when we
 242          * perform tavor_srq_modify() operations later.
 243          */
 244         wqesz = (1 << srq->srq_wq_log_wqesz);
 245         srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
 246         srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
 247         srq->srq_wqinfo.qa_bind_align = PAGESIZE;
 248         if (srq_is_umap) {
 249                 srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
 250         } else {
 251                 srq->srq_wqinfo.qa_location = wq_location;
 252         }
 253         status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
 254         if (status != DDI_SUCCESS) {
 255                 /* Set "status" and "errormsg" and goto failure */
 256                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
 257                 goto srqalloc_fail4;
 258         }
 259         buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
 260         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
 261 
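
To make the sizing above concrete, a worked example with assumed values
(not taken from this webrev):

	/*
	 * Hypothetical sizing (values assumed):
	 *   srq_wq_log_wqesz = 6   ->  wqesz = 1 << 6  = 64 bytes per WQE
	 *   log_srq_size     = 10  ->  1 << 10 = 1024 SRQ entries
	 *   qa_size = 1024 * 64 = 65536 bytes (64 KB), allocated and bound
	 *   on PAGESIZE alignment so that queue offsets stay stable across
	 *   a later tavor_srq_modify().
	 */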
 262         /*
 263          * Register the memory for the SRQ work queues.  The memory for the SRQ
 264          * must be registered in the Tavor TPT tables.  This gives us the LKey
 265          * to specify in the SRQ context later.  Note: If the work queue is to
 266          * be allocated from DDR memory, then only a "bypass" mapping is
 267          * appropriate.  And if the SRQ memory is user-mappable, then we force
 268          * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
 269          * restriction, we pass the "mro_bind_override_addr" flag in the call
 270          * to tavor_mr_register().  This guarantees that the resulting IB vaddr
 271          * will be zero-based (modulo the offset into the first page).  If we
 272          * fail here, we still have a bunch of resource and reference-count
 273          * cleanup to do.
 274          */
 275         flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
 276             IBT_MR_NOSLEEP;
 277         mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
 278         mr_attr.mr_len   = srq->srq_wqinfo.qa_size;
 279         mr_attr.mr_as    = NULL;
 280         mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
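
The branch below selects the DMA binding mode (its opening line, which
handles the user-mappable case, falls outside this hunk).  Summarized as a
sketch, with the elided part assumed:

	/*
	 * Mapping-mode selection (sketch; the elided opening branch is
	 * assumed to cover srq_is_umap):
	 *   normal system memory -> bind type from cp_iommu_bypass; if the
	 *       profile selects DDI_DMA_STREAMING, IBT_MR_NONCOHERENT is
	 *       also set
	 *   DDR memory           -> TAVOR_BINDMEM_BYPASS, since only a
	 *       "bypass" mapping is appropriate there
	 */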


 284                 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
 285                         mr_op.mro_bind_type =
 286                             state->ts_cfg_profile->cp_iommu_bypass;
 287                         dma_xfer_mode =
 288                             state->ts_cfg_profile->cp_streaming_consistent;
 289                         if (dma_xfer_mode == DDI_DMA_STREAMING) {
 290                                 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
 291                         }
 292                 } else {
 293                         mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
 294                 }
 295         }
 296         mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
 297         mr_op.mro_bind_override_addr = 1;
 298         status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
 299         if (status != DDI_SUCCESS) {
 300                 /* Set "status" and "errormsg" and goto failure */
 301                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
 302                 goto srqalloc_fail5;
 303         }
 304         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
 305         addr = mr->mr_bindinfo.bi_addr;
 306         lkey = mr->mr_lkey;
 307 
 308         /*
 309          * Calculate the offset between the kernel virtual address space
 310          * and the IB virtual address space.  This will be used when
 311          * posting work requests to properly initialize each WQE.
 312          */
 313         srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
 314             (uint64_t)mr->mr_bindinfo.bi_addr;
 315 
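
To make the offset arithmetic concrete, a hypothetical example (addresses
assumed, not from this webrev):

	/*
	 * mro_bind_override_addr makes the IB vaddr zero-based (modulo the
	 * offset into the first page), so a page-aligned buffer gives:
	 *   qa_buf_aligned (kernel VA)  = 0xffffff0123456000
	 *   bi_addr        (IB VA)      = 0x0
	 *   srq_desc_off                = 0xffffff0123456000
	 * Posting code can then derive the IB address of any WQE as:
	 *   wqe_ibva = wqe_kva - srq_desc_off;
	 */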
 316         /*
 317          * Create WQL and Wridlist for use by this SRQ
 318          */
 319         srq->srq_wrid_wql = tavor_wrid_wql_create(state);
 320         if (srq->srq_wrid_wql == NULL) {
 321                 /* Set "status" and "errormsg" and goto failure */
 322                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
 323                 goto srqalloc_fail6;
 324         }
 325         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))
 326 
 327         srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
 328         if (srq->srq_wridlist == NULL) {
 329                 /* Set "status" and "errormsg" and goto failure */
 330                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
 331                 goto srqalloc_fail7;
 332         }
 333         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))
 334 
 335         srq->srq_wridlist->wl_srq_en = 1;
 336         srq->srq_wridlist->wl_free_list_indx = -1;
 337 
 338         /*
 339          * Fill in all the return arguments (if necessary).  This includes
 340          * real queue size and real SGLs.
 341          */
 342         if (real_sizes != NULL) {
 343                 real_sizes->srq_wr_sz = (1 << log_srq_size);
 344                 real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
 345         }
 346 
 347         /*
 348          * Fill in the SRQC entry.  This is the final step before passing
 349          * ownership of the SRQC entry to the Tavor hardware.  We use all of
 350          * the information collected/calculated above to fill in the
 351          * requisite portions of the SRQC.  Note: If this SRQ is going to be
 352          * used for userland access, then we need to set the UAR page number
 353          * appropriately (otherwise it's a "don't care")


 543                             maxprot, DEVMAP_MAPPING_INVALID, NULL);
 544                         if (status != DDI_SUCCESS) {
 545                                 mutex_exit(&srq->srq_lock);
 546                                 TAVOR_WARNING(state, "failed in SRQ memory "
 547                                     "devmap_devmem_remap()");
 548                                 TAVOR_TNF_EXIT(tavor_srq_free);
 549                                 return (ibc_get_ci_failure(0));
 550                         }
 551                         srq->srq_umap_dhp = (devmap_cookie_t)NULL;
 552                 }
 553         }
 554 
 555         /*
 556          * Put NULL into the Tavor SRQNum-to-SRQHdl list.  This will allow any
 557          * in-progress events to detect that the SRQ corresponding to this
 558          * number has been freed.
 559          */
 560         state->ts_srqhdl[srqc->tr_indx] = NULL;
 561 
 562         mutex_exit(&srq->srq_lock);
 563         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
 564         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));
 565 
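
A hypothetical sketch of the consumer side of the NULL stored into
ts_srqhdl above (handler shape assumed, not copied from this webrev):

	/* In an event handler, look the SRQ up by number... */
	srq = state->ts_srqhdl[srqnum];
	if (srq == NULL) {
		/* ...and discard events for an SRQ freed while in flight. */
		return;
	}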
 566         /*
 567          * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
 568          * firmware command).  If the ownership transfer fails for any reason,
 569          * then it is an indication that something (either in HW or SW) has
 570          * gone seriously wrong.
 571          */
 572         status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
 573             sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
 574         if (status != TAVOR_CMD_SUCCESS) {
 575                 TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
 576                 cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
 577                     status);
 578                 TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
 579                     TAVOR_TNF_ERROR, "", tnf_uint, status, status);
 580                 TAVOR_TNF_EXIT(tavor_srq_free);
 581                 return (IBT_FAILURE);
 582         }
 583 
 584         /*


 712          * for a zero-based queue.  By making sure we are aligned on at least a
 713          * page, any offset we use into our queue will be the same as it was
 714          * when we allocated it at tavor_srq_alloc() time.
 715          */
 716         wqesz = (1 << srq->srq_wq_log_wqesz);
 717         new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
 718         new_srqinfo.qa_alloc_align = PAGESIZE;
 719         new_srqinfo.qa_bind_align  = PAGESIZE;
 720         if (srq->srq_is_umap) {
 721                 new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
 722         } else {
 723                 new_srqinfo.qa_location = wq_location;
 724         }
 725         status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
 726         if (status != DDI_SUCCESS) {
 727                 /* Set "status" and "errormsg" and goto failure */
 728                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
 729                 goto srqmodify_fail;
 730         }
 731         buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
 732         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
 733 
 734         /*
 735          * Allocate the memory for the new WRE list.  This will be used later
 736          * when we resize the wridlist based on the new SRQ size.
 737          */
 738         wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
 739             sizeof (tavor_wrid_entry_t), sleepflag);
 740         if (wre_new == NULL) {
 741                 /* Set "status" and "errormsg" and goto failure */
 742                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
 743                     "failed wre_new alloc");
 744                 goto srqmodify_fail;
 745         }
 746 
 747         /*
 748          * Fill in the "bind" struct.  This struct provides the majority
 749          * of the information that will be used to distinguish between an
 750          * "addr" binding (as is the case here) and a "buf" binding (see
 751          * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
 752          * which does most of the "heavy lifting" for the Tavor memory
 753          * registration routines.
 754          */
 755         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
 756         bzero(&bind, sizeof (tavor_bind_info_t));
 757         bind.bi_type  = TAVOR_BINDHDL_VADDR;
 758         bind.bi_addr  = (uint64_t)(uintptr_t)buf;
 759         bind.bi_len   = new_srqinfo.qa_size;
 760         bind.bi_as    = NULL;
 761         bind.bi_flags = ((sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
 762             IBT_MR_NOSLEEP) | IBT_MR_ENABLE_LOCAL_WRITE;
 763         if (srq->srq_is_umap) {
 764                 bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
 765         } else {
 766                 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
 767                         bind.bi_bypass =
 768                             state->ts_cfg_profile->cp_iommu_bypass;
 769                         dma_xfer_mode =
 770                             state->ts_cfg_profile->cp_streaming_consistent;
 771                         if (dma_xfer_mode == DDI_DMA_STREAMING) {
 772                                 bind.bi_flags |= IBT_MR_NONCOHERENT;
 773                         }
 774                 } else {
 775                         bind.bi_bypass = TAVOR_BINDMEM_BYPASS;


 882          * information for freeing up the old resources
 883          */
 884         old_srqinfo        = srq->srq_wqinfo;
 885         old_mtt            = srq->srq_mrhdl->mr_mttrsrcp;
 886         bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
 887             sizeof (tavor_bind_info_t));
 888 
 889         /* Now set the new info */
 890         srq->srq_wqinfo         = new_srqinfo;
 891         srq->srq_wq_buf         = buf;
 892         srq->srq_wq_bufsz  = (1 << log_srq_size);
 893         bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
 894         srq->srq_mrhdl->mr_mttrsrcp = mtt;
 895         srq->srq_desc_off  = srq_desc_off;
 896         srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
 897 
 898         /* Update MR mtt pagesize */
 899         mr->mr_logmttpgsz = mtt_pgsize_bits;
 900         mutex_exit(&mr->mr_lock);
 901 
 902 #ifdef __lock_lint
 903         mutex_enter(&srq->srq_wrid_wql->wql_lock);
 904 #else
 905         if (srq->srq_wrid_wql != NULL) {
 906                 mutex_enter(&srq->srq_wrid_wql->wql_lock);
 907         }
 908 #endif
 909 
 910         /*
 911          * Initialize the new wridlist, if needed.
 912          *
 913          * If a wridlist is already set up on an SRQ (i.e. the QP associated
 914          * with the SRQ has moved "from_reset"), then we must update it for
 915          * the new SRQ size.  We allocate the new number of Work Request ID
 916          * Entries, copy the old entries over to the new list, and
 917          * re-initialize the SRQ wridlist in the non-umap case.
 918          */
 919         wre_old = NULL;
 920         if (srq->srq_wridlist != NULL) {
 921                 wre_old = srq->srq_wridlist->wl_wre;
 922 
 923                 bcopy(wre_old, wre_new, srq_old_bufsz *
 924                     sizeof (tavor_wrid_entry_t));
 925 
 926                 /* Setup new sizes in wre */
 927                 srq->srq_wridlist->wl_wre = wre_new;
 928                 srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;
 929 
 930                 if (!srq->srq_is_umap) {
 931                         tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
 932                             srq_old_bufsz);
 933                 }
 934         }
 935 
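
A worked example of the resize above, with assumed sizes:

	/*
	 * Hypothetical resize (sizes assumed): 512 -> 1024 entries.
	 * bcopy() moves the 512 live WREs into the front of the zeroed
	 * 1024-entry wre_new array and wl_size becomes 1024; in the
	 * non-umap case, tavor_wrid_list_srq_init() is handed the old
	 * count (srq_old_bufsz = 512), presumably so only the fresh tail
	 * entries need (re)initialization.
	 */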
 936 #ifdef __lock_lint
 937         mutex_exit(&srq->srq_wrid_wql->wql_lock);
 938 #else
 939         if (srq->srq_wrid_wql != NULL) {
 940                 mutex_exit(&srq->srq_wrid_wql->wql_lock);
 941         }
 942 #endif
 943 
 944         /*
 945          * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
 946          * to a user process, then we need to call devmap_devmem_remap() to
 947          * invalidate the mapping to the SRQ memory.  We also need to
 948          * invalidate the SRQ tracking information for the user mapping.
 949          *
 950          * Note: the remap really shouldn't ever fail.  If it does, it is an
 951          * indication that something has gone seriously wrong.  So we print a
 952          * warning message and return an error (knowing, of course, that the
 953          * "old" SRQ memory will be leaked).
 954          */
 955         if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
 956                 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
 957                 status = devmap_devmem_remap(srq->srq_umap_dhp,
 958                     state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
 959                     DEVMAP_MAPPING_INVALID, NULL);
 960                 if (status != DDI_SUCCESS) {
 961                         mutex_exit(&srq->srq_lock);
 962                         TAVOR_WARNING(state, "failed in SRQ memory "




  61         ibt_srq_flags_t         flags;
  62         tavor_rsrc_t            *srqc, *rsrc;
  63         tavor_hw_srqc_t         srqc_entry;
  64         uint32_t                *buf;
  65         tavor_srqhdl_t          srq;
  66         tavor_umap_db_entry_t   *umapdb;
  67         ibt_mr_attr_t           mr_attr;
  68         tavor_mr_options_t      mr_op;
  69         tavor_mrhdl_t           mr;
  70         uint64_t                addr;
  71         uint64_t                value, srq_desc_off;
  72         uint32_t                lkey;
  73         uint32_t                log_srq_size;
  74         uint32_t                uarpg;
  75         uint_t                  wq_location, dma_xfer_mode, srq_is_umap;
  76         int                     flag, status;
  77         char                    *errormsg;
  78         uint_t                  max_sgl;
  79         uint_t                  wqesz;
  80 


  81         TAVOR_TNF_ENTER(tavor_srq_alloc);
  82 
  83         /*
  84          * Check the "options" flag.  Currently this flag tells the driver
  85          * whether the SRQ's work queues should come from normal system
  86          * memory or be allocated from DDR memory.
  87          */
  88         if (op == NULL) {
  89                 wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
  90         } else {
  91                 wq_location = op->srqo_wq_loc;
  92         }
  93 
  94         /*
  95          * Extract the necessary info from the tavor_srq_info_t structure
  96          */
  97         real_sizes = srqinfo->srqi_real_sizes;
  98         sizes      = srqinfo->srqi_sizes;
  99         pd         = srqinfo->srqi_pd;
 100         ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;


 124         /* Increase PD refcnt */
 125         tavor_pd_refcnt_inc(pd);
 126 
 127         /* Allocate an SRQ context entry */
 128         status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
 129         if (status != DDI_SUCCESS) {
 130                 /* Set "status" and "errormsg" and goto failure */
 131                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
 132                 goto srqalloc_fail1;
 133         }
 134 
 135         /* Allocate the SRQ Handle entry */
 136         status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
 137         if (status != DDI_SUCCESS) {
 138                 /* Set "status" and "errormsg" and goto failure */
 139                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
 140                 goto srqalloc_fail2;
 141         }
 142 
 143         srq = (tavor_srqhdl_t)rsrc->tr_addr;

 144 
 145         srq->srq_srqnum = srqc->tr_indx;  /* just use index */
 146 
 147         /*
 148          * If this will be a user-mappable SRQ, then allocate an entry for
 149          * the "userland resources database".  This will later be added to
 150          * the database (after all further SRQ operations are successful).
 151          * If we fail here, we must undo the reference counts and the
 152          * previous resource allocation.
 153          */
 154         if (srq_is_umap) {
 155                 umapdb = tavor_umap_db_alloc(state->ts_instance,
 156                     srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
 157                     (uint64_t)(uintptr_t)rsrc);
 158                 if (umapdb == NULL) {
 159                         /* Set "status" and "errormsg" and goto failure */
 160                         TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
 161                         goto srqalloc_fail3;
 162                 }
 163         }


 237          * a zero-based queue.  By making sure we are aligned on at least a
 238          * page, any offset we use into our queue will be the same as when we
 239          * perform tavor_srq_modify() operations later.
 240          */
 241         wqesz = (1 << srq->srq_wq_log_wqesz);
 242         srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
 243         srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
 244         srq->srq_wqinfo.qa_bind_align = PAGESIZE;
 245         if (srq_is_umap) {
 246                 srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
 247         } else {
 248                 srq->srq_wqinfo.qa_location = wq_location;
 249         }
 250         status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
 251         if (status != DDI_SUCCESS) {
 252                 /* Set "status" and "errormsg" and goto failure */
 253                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
 254                 goto srqalloc_fail4;
 255         }
 256         buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;

 257 
 258         /*
 259          * Register the memory for the SRQ work queues.  The memory for the SRQ
 260          * must be registered in the Tavor TPT tables.  This gives us the LKey
 261          * to specify in the SRQ context later.  Note: If the work queue is to
 262          * be allocated from DDR memory, then only a "bypass" mapping is
 263          * appropriate.  And if the SRQ memory is user-mappable, then we force
 264          * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
 265          * restriction, we pass the "mro_bind_override_addr" flag in the call
 266          * to tavor_mr_register().  This guarantees that the resulting IB vaddr
 267          * will be zero-based (modulo the offset into the first page).  If we
 268          * fail here, we still have a bunch of resource and reference-count
 269          * cleanup to do.
 270          */
 271         flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
 272             IBT_MR_NOSLEEP;
 273         mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
 274         mr_attr.mr_len   = srq->srq_wqinfo.qa_size;
 275         mr_attr.mr_as    = NULL;
 276         mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;


 280                 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
 281                         mr_op.mro_bind_type =
 282                             state->ts_cfg_profile->cp_iommu_bypass;
 283                         dma_xfer_mode =
 284                             state->ts_cfg_profile->cp_streaming_consistent;
 285                         if (dma_xfer_mode == DDI_DMA_STREAMING) {
 286                                 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
 287                         }
 288                 } else {
 289                         mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
 290                 }
 291         }
 292         mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
 293         mr_op.mro_bind_override_addr = 1;
 294         status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
 295         if (status != DDI_SUCCESS) {
 296                 /* Set "status" and "errormsg" and goto failure */
 297                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
 298                 goto srqalloc_fail5;
 299         }

 300         addr = mr->mr_bindinfo.bi_addr;
 301         lkey = mr->mr_lkey;
 302 
 303         /*
 304          * Calculate the offset between the kernel virtual address space
 305          * and the IB virtual address space.  This will be used when
 306          * posting work requests to properly initialize each WQE.
 307          */
 308         srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
 309             (uint64_t)mr->mr_bindinfo.bi_addr;
 310 
 311         /*
 312          * Create WQL and Wridlist for use by this SRQ
 313          */
 314         srq->srq_wrid_wql = tavor_wrid_wql_create(state);
 315         if (srq->srq_wrid_wql == NULL) {
 316                 /* Set "status" and "errormsg" and goto failure */
 317                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
 318                 goto srqalloc_fail6;
 319         }

 320 
 321         srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
 322         if (srq->srq_wridlist == NULL) {
 323                 /* Set "status" and "errormsg" and goto failure */
 324                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
 325                 goto srqalloc_fail7;
 326         }

 327 
 328         srq->srq_wridlist->wl_srq_en = 1;
 329         srq->srq_wridlist->wl_free_list_indx = -1;
 330 
 331         /*
 332          * Fill in all the return arguments (if necessary).  This includes
 333          * real queue size and real SGLs.
 334          */
 335         if (real_sizes != NULL) {
 336                 real_sizes->srq_wr_sz = (1 << log_srq_size);
 337                 real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
 338         }
 339 
 340         /*
 341          * Fill in the SRQC entry.  This is the final step before passing
 342          * ownership of the SRQC entry to the Tavor hardware.  We use all of
 343          * the information collected/calculated above to fill in the
 344          * requisite portions of the SRQC.  Note: If this SRQ is going to be
 345          * used for userland access, then we need to set the UAR page number
 346          * appropriately (otherwise it's a "don't care")


 536                             maxprot, DEVMAP_MAPPING_INVALID, NULL);
 537                         if (status != DDI_SUCCESS) {
 538                                 mutex_exit(&srq->srq_lock);
 539                                 TAVOR_WARNING(state, "failed in SRQ memory "
 540                                     "devmap_devmem_remap()");
 541                                 TAVOR_TNF_EXIT(tavor_srq_free);
 542                                 return (ibc_get_ci_failure(0));
 543                         }
 544                         srq->srq_umap_dhp = (devmap_cookie_t)NULL;
 545                 }
 546         }
 547 
 548         /*
 549          * Put NULL into the Tavor SRQNum-to-SRQHdl list.  This will allow any
 550          * in-progress events to detect that the SRQ corresponding to this
 551          * number has been freed.
 552          */
 553         state->ts_srqhdl[srqc->tr_indx] = NULL;
 554 
 555         mutex_exit(&srq->srq_lock);


 556 
 557         /*
 558          * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
 559          * firmware command).  If the ownership transfer fails for any reason,
 560          * then it is an indication that something (either in HW or SW) has
 561          * gone seriously wrong.
 562          */
 563         status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
 564             sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
 565         if (status != TAVOR_CMD_SUCCESS) {
 566                 TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
 567                 cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
 568                     status);
 569                 TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
 570                     TAVOR_TNF_ERROR, "", tnf_uint, status, status);
 571                 TAVOR_TNF_EXIT(tavor_srq_free);
 572                 return (IBT_FAILURE);
 573         }
 574 
 575         /*


 703          * for a zero-based queue.  By making sure we are aligned on at least a
 704          * page, any offset we use into our queue will be the same as it was
 705          * when we allocated it at tavor_srq_alloc() time.
 706          */
 707         wqesz = (1 << srq->srq_wq_log_wqesz);
 708         new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
 709         new_srqinfo.qa_alloc_align = PAGESIZE;
 710         new_srqinfo.qa_bind_align  = PAGESIZE;
 711         if (srq->srq_is_umap) {
 712                 new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
 713         } else {
 714                 new_srqinfo.qa_location = wq_location;
 715         }
 716         status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
 717         if (status != DDI_SUCCESS) {
 718                 /* Set "status" and "errormsg" and goto failure */
 719                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
 720                 goto srqmodify_fail;
 721         }
 722         buf = (uint32_t *)new_srqinfo.qa_buf_aligned;

 723 
 724         /*
 725          * Allocate the memory for the new WRE list.  This will be used later
 726          * when we resize the wridlist based on the new SRQ size.
 727          */
 728         wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
 729             sizeof (tavor_wrid_entry_t), sleepflag);
 730         if (wre_new == NULL) {
 731                 /* Set "status" and "errormsg" and goto failure */
 732                 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
 733                     "failed wre_new alloc");
 734                 goto srqmodify_fail;
 735         }
 736 
 737         /*
 738          * Fill in the "bind" struct.  This struct provides the majority
 739          * of the information that will be used to distinguish between an
 740          * "addr" binding (as is the case here) and a "buf" binding (see
 741          * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
 742          * which does most of the "heavy lifting" for the Tavor memory
 743          * registration routines.
 744          */

 745         bzero(&bind, sizeof (tavor_bind_info_t));
 746         bind.bi_type  = TAVOR_BINDHDL_VADDR;
 747         bind.bi_addr  = (uint64_t)(uintptr_t)buf;
 748         bind.bi_len   = new_srqinfo.qa_size;
 749         bind.bi_as    = NULL;
 750         bind.bi_flags = ((sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
 751             IBT_MR_NOSLEEP) | IBT_MR_ENABLE_LOCAL_WRITE;
 752         if (srq->srq_is_umap) {
 753                 bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
 754         } else {
 755                 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
 756                         bind.bi_bypass =
 757                             state->ts_cfg_profile->cp_iommu_bypass;
 758                         dma_xfer_mode =
 759                             state->ts_cfg_profile->cp_streaming_consistent;
 760                         if (dma_xfer_mode == DDI_DMA_STREAMING) {
 761                                 bind.bi_flags |= IBT_MR_NONCOHERENT;
 762                         }
 763                 } else {
 764                         bind.bi_bypass = TAVOR_BINDMEM_BYPASS;


 871          * information for freeing up the old resources
 872          */
 873         old_srqinfo        = srq->srq_wqinfo;
 874         old_mtt            = srq->srq_mrhdl->mr_mttrsrcp;
 875         bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
 876             sizeof (tavor_bind_info_t));
 877 
 878         /* Now set the new info */
 879         srq->srq_wqinfo         = new_srqinfo;
 880         srq->srq_wq_buf         = buf;
 881         srq->srq_wq_bufsz  = (1 << log_srq_size);
 882         bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
 883         srq->srq_mrhdl->mr_mttrsrcp = mtt;
 884         srq->srq_desc_off  = srq_desc_off;
 885         srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
 886 
 887         /* Update MR mtt pagesize */
 888         mr->mr_logmttpgsz = mtt_pgsize_bits;
 889         mutex_exit(&mr->mr_lock);
 890 



 891         if (srq->srq_wrid_wql != NULL) {
 892                 mutex_enter(&srq->srq_wrid_wql->wql_lock);
 893         }

 894 
 895         /*
 896          * Initialize the new wridlist, if needed.
 897          *
 898          * If a wridlist is already set up on an SRQ (i.e. the QP associated
 899          * with the SRQ has moved "from_reset"), then we must update it for
 900          * the new SRQ size.  We allocate the new number of Work Request ID
 901          * Entries, copy the old entries over to the new list, and
 902          * re-initialize the SRQ wridlist in the non-umap case.
 903          */
 904         wre_old = NULL;
 905         if (srq->srq_wridlist != NULL) {
 906                 wre_old = srq->srq_wridlist->wl_wre;
 907 
 908                 bcopy(wre_old, wre_new, srq_old_bufsz *
 909                     sizeof (tavor_wrid_entry_t));
 910 
 911                 /* Setup new sizes in wre */
 912                 srq->srq_wridlist->wl_wre = wre_new;
 913                 srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;
 914 
 915                 if (!srq->srq_is_umap) {
 916                         tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
 917                             srq_old_bufsz);
 918                 }
 919         }
 920 



 921         if (srq->srq_wrid_wql != NULL) {
 922                 mutex_exit(&srq->srq_wrid_wql->wql_lock);
 923         }

 924 
 925         /*
 926          * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
 927          * to a user process, then we need to call devmap_devmem_remap() to
 928          * invalidate the mapping to the SRQ memory.  We also need to
 929          * invalidate the SRQ tracking information for the user mapping.
 930          *
 931          * Note: the remap really shouldn't ever fail.  If it does, it is an
 932          * indication that something has gone seriously wrong.  So we print a
 933          * warning message and return an error (knowing, of course, that the
 934          * "old" SRQ memory will be leaked).
 935          */
 936         if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
 937                 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
 938                 status = devmap_devmem_remap(srq->srq_umap_dhp,
 939                     state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
 940                     DEVMAP_MAPPING_INVALID, NULL);
 941                 if (status != DDI_SUCCESS) {
 942                         mutex_exit(&srq->srq_lock);
 943                         TAVOR_WARNING(state, "failed in SRQ memory "