Print this page
7813 mpt_sas does not like concurrent HBA resets

@@ -554,11 +554,11 @@
         mptsas_quiesce          /* quiesce */
 #endif  /* __sparc */
 };
 
 
-#define MPTSAS_MOD_STRING "MPTSAS HBA Driver 00.00.00.24"
+#define MPTSAS_MOD_STRING "MPTSAS HBA Driver 00.00.00.24X"
 
 static struct modldrv modldrv = {
         &mod_driverops, /* Type of module. This one is a driver */
         MPTSAS_MOD_STRING, /* Name of the module. */
         &mptsas_ops,    /* driver ops */

@@ -1333,10 +1333,16 @@
         MPTSAS_DISABLE_INTR(mpt);
         if (mptsas_register_intrs(mpt) == FALSE)
                 goto fail;
         intr_added++;
 
+        /*
+         * Spin mutex guarding m_in_reset, the HBA-reset-in-progress flag
+         */
+        mutex_init(&mpt->m_taskmgmt_mutex, NULL, MUTEX_SPIN,
+            DDI_INTR_PRI(mpt->m_intr_pri));
+
         /* Initialize mutex used in interrupt handler */
         mutex_init(&mpt->m_mutex, NULL, MUTEX_DRIVER,
             DDI_INTR_PRI(mpt->m_intr_pri));
         mutex_init(&mpt->m_passthru_mutex, NULL, MUTEX_DRIVER, NULL);
         mutex_init(&mpt->m_tx_waitq_mutex, NULL, MUTEX_DRIVER,

@@ -1627,10 +1633,11 @@
                 }
                 if (mutex_init_done) {
                         mutex_destroy(&mpt->m_tx_waitq_mutex);
                         mutex_destroy(&mpt->m_passthru_mutex);
                         mutex_destroy(&mpt->m_mutex);
+                        mutex_destroy(&mpt->m_taskmgmt_mutex);
                         for (i = 0; i < MPTSAS_MAX_PHYS; i++) {
                                 mutex_destroy(
                                     &mpt->m_phy_info[i].smhba_info.phy_mutex);
                         }
                         cv_destroy(&mpt->m_cv);

@@ -2046,10 +2053,11 @@
         }
 
         mutex_destroy(&mpt->m_tx_waitq_mutex);
         mutex_destroy(&mpt->m_passthru_mutex);
         mutex_destroy(&mpt->m_mutex);
+        mutex_destroy(&mpt->m_taskmgmt_mutex);
         for (i = 0; i < MPTSAS_MAX_PHYS; i++) {
                 mutex_destroy(&mpt->m_phy_info[i].smhba_info.phy_mutex);
         }
         cv_destroy(&mpt->m_cv);
         cv_destroy(&mpt->m_passthru_cv);

@@ -2407,11 +2415,11 @@
                  * If IOC is not in operational state, try to hard reset it.
                  */
                 if ((ioc_status & MPI2_IOC_STATE_MASK) !=
                     MPI2_IOC_STATE_OPERATIONAL) {
                         mpt->m_softstate &= ~MPTSAS_SS_MSG_UNIT_RESET;
-                        if (mptsas_restart_ioc(mpt) == DDI_FAILURE) {
+                        if (mptsas_reset_handler(mpt) == DDI_FAILURE) {
                                 mptsas_log(mpt, CE_WARN,
                                     "mptsas_power: hard reset failed");
                                 mutex_exit(&mpt->m_mutex);
                                 return (DDI_FAILURE);
                         }

@@ -3424,20 +3432,24 @@
          * which means that they could be invalid even if the target is still
          * attached.  Check if being reset and if DevHandle is being
          * re-initialized.  If this is the case, return BUSY so the I/O can be
          * retried later.
          */
+        mutex_enter(&mpt->m_taskmgmt_mutex);
         if ((ptgt->m_devhdl == MPTSAS_INVALID_DEVHDL) && mpt->m_in_reset) {
                 mptsas_set_pkt_reason(mpt, cmd, CMD_RESET, STAT_BUS_RESET);
                 if (cmd->cmd_flags & CFLAG_TXQ) {
                         mptsas_doneq_add(mpt, cmd);
                         mptsas_doneq_empty(mpt);
+                        mutex_exit(&mpt->m_taskmgmt_mutex);
                         return (rval);
                 } else {
+                        mutex_exit(&mpt->m_taskmgmt_mutex);
                         return (TRAN_BUSY);
                 }
         }
+        mutex_exit(&mpt->m_taskmgmt_mutex);
 
         /*
          * If device handle has already been invalidated, just
          * fail the command. In theory, command from scsi_vhci
          * client is impossible send down command with invalid

@@ -3689,17 +3701,20 @@
                          * need to increase the reference counter here.  In a
                          * case the HBA is in reset we just simply free the
                          * allocated packet and bail out.
                          */
                         mutex_enter(&mpt->m_mutex);
-                        if (mpt->m_in_reset) {
+                        mutex_enter(&mpt->m_taskmgmt_mutex);
+                        if (mpt->m_in_reset == TRUE) {
+                                mutex_exit(&mpt->m_taskmgmt_mutex);
                                 mutex_exit(&mpt->m_mutex);
 
                                 cmd->cmd_flags = CFLAG_FREE;
                                 kmem_cache_free(mpt->m_kmem_cache, cmd);
                                 return (NULL);
                         }
+                        mutex_exit(&mpt->m_taskmgmt_mutex);
                         mpt->m_extreq_sense_refcount++;
                         ASSERT(mpt->m_extreq_sense_refcount > 0);
                         mutex_exit(&mpt->m_mutex);
 
                         /*

@@ -5370,14 +5385,17 @@
                          * and ack would be sent in taskq thread
                          */
                         NDBG20(("send mptsas_handle_event_sync success"));
                 }
 
-                if (mpt->m_in_reset) {
+                mutex_enter(&mpt->m_taskmgmt_mutex);
+                if (mpt->m_in_reset == TRUE) {
                         NDBG20(("dropping event received during reset"));
+                        mutex_exit(&mpt->m_taskmgmt_mutex);
                         return;
                 }
+                mutex_exit(&mpt->m_taskmgmt_mutex);
 
                 if ((ddi_taskq_dispatch(mpt->m_event_taskq, mptsas_handle_event,
                     (void *)args, DDI_NOSLEEP)) != DDI_SUCCESS) {
                         mptsas_log(mpt, CE_WARN, "No memory available"
                         "for dispatch taskq");

@@ -6340,14 +6358,16 @@
                 mutex_enter(&mpt->m_mutex);
                 /*
                  * If HBA is being reset, don't perform operations depending
                  * on the IOC. We must free the topo list, however.
                  */
-                if (!mpt->m_in_reset)
+
+                mutex_enter(&mpt->m_taskmgmt_mutex);
+                if (mpt->m_in_reset == FALSE)
                         mptsas_handle_topo_change(topo_node, parent);
-                else
-                        NDBG20(("skipping topo change received during reset"));
+                mutex_exit(&mpt->m_taskmgmt_mutex);
+
                 save_node = topo_node;
                 topo_node = topo_node->next;
                 ASSERT(save_node);
                 kmem_free(save_node, sizeof (mptsas_topo_change_list_t));
                 mutex_exit(&mpt->m_mutex);

@@ -7612,15 +7632,18 @@
 
         mutex_enter(&mpt->m_mutex);
         /*
          * If HBA is being reset, drop incoming event.
          */
-        if (mpt->m_in_reset) {
+        mutex_enter(&mpt->m_taskmgmt_mutex);
+        if (mpt->m_in_reset == TRUE) {
                 NDBG20(("dropping event received prior to reset"));
+                mutex_exit(&mpt->m_taskmgmt_mutex);
                 mutex_exit(&mpt->m_mutex);
                 return;
         }
+        mutex_exit(&mpt->m_taskmgmt_mutex);
 
         eventreply = (pMpi2EventNotificationReply_t)
             (mpt->m_reply_frame + (rfm -
             (mpt->m_reply_frame_dma_addr & 0xffffffffu)));
         event = ddi_get16(mpt->m_acc_reply_frame_hdl, &eventreply->Event);

@@ -9920,11 +9943,11 @@
                 if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
                         doorbell &= MPI2_DOORBELL_DATA_MASK;
                         mptsas_log(mpt, CE_WARN, "MPT Firmware Fault, "
                             "code: %04x", doorbell);
                         mpt->m_softstate &= ~MPTSAS_SS_MSG_UNIT_RESET;
-                        if ((mptsas_restart_ioc(mpt)) == DDI_FAILURE) {
+                        if ((mptsas_reset_handler(mpt)) == DDI_FAILURE) {
                                 mptsas_log(mpt, CE_WARN, "Reset failed"
                                     "after fault was detected");
                         }
                 }
 

@@ -11177,12 +11200,12 @@
                         status = EFAULT;
                 }
                 mptsas_dma_free(&dataout_dma_state);
         }
         if (pt_flags & MPTSAS_CMD_TIMEOUT) {
-                if ((mptsas_restart_ioc(mpt)) == DDI_FAILURE) {
-                        mptsas_log(mpt, CE_WARN, "mptsas_restart_ioc failed");
+                if ((mptsas_reset_handler(mpt)) == DDI_FAILURE) {
+                        mptsas_log(mpt, CE_WARN, "mptsas_reset_handler failed");
                 }
         }
         if (request_msg)
                 kmem_free(request_msg, request_size);
         NDBG27(("mptsas_do_passthru: Done status 0x%x", status));

@@ -12683,11 +12706,11 @@
                         /*
                          * Reset the chip to start using the new
                          * firmware.  Reset if failed also.
                          */
                         mpt->m_softstate &= ~MPTSAS_SS_MSG_UNIT_RESET;
-                        if (mptsas_restart_ioc(mpt) == DDI_FAILURE) {
+                        if (mptsas_reset_handler(mpt) == DDI_FAILURE) {
                                 status = EFAULT;
                         }
                         mutex_exit(&mpt->m_mutex);
                         break;
                 case MPTIOCTL_PASS_THRU:

@@ -12755,11 +12778,11 @@
                         }
                         break;
                 case MPTIOCTL_RESET_ADAPTER:
                         mutex_enter(&mpt->m_mutex);
                         mpt->m_softstate &= ~MPTSAS_SS_MSG_UNIT_RESET;
-                        if ((mptsas_restart_ioc(mpt)) == DDI_FAILURE) {
+                        if ((mptsas_reset_handler(mpt)) == DDI_FAILURE) {
                                 mptsas_log(mpt, CE_WARN, "reset adapter IOCTL "
                                     "failed");
                                 status = EFAULT;
                         }
                         mutex_exit(&mpt->m_mutex);

@@ -12822,25 +12845,31 @@
 out:
         return (status);
 }
 
 int
-mptsas_restart_ioc(mptsas_t *mpt)
+mptsas_reset_handler(mptsas_t *mpt)
 {
         int             rval = DDI_SUCCESS;
         mptsas_target_t *ptgt = NULL;
 
         ASSERT(mutex_owned(&mpt->m_mutex));
 
         /*
-         * Set a flag telling I/O path that we're processing a reset.  This is
-         * needed because after the reset is complete, the hash table still
+         * Set a flag telling task management we are processing a reset.  This
+         * is needed because after the reset is complete, the hash table still
          * needs to be rebuilt.  If I/Os are started before the hash table is
          * rebuilt, I/O errors will occur.  This flag allows I/Os to be marked
          * so that they can be retried.
          */
+        mutex_enter(&mpt->m_taskmgmt_mutex);
+        if (mpt->m_in_reset == TRUE) {
+                mutex_exit(&mpt->m_taskmgmt_mutex);
+                return (DDI_FAILURE);
+        }
         mpt->m_in_reset = TRUE;
+        mutex_exit(&mpt->m_taskmgmt_mutex);
 
         /*
          * Wait until all the allocated sense data buffers for DMA are freed.
          */
         while (mpt->m_extreq_sense_refcount > 0)

@@ -12901,11 +12930,13 @@
         }
 
         /*
          * Clear the reset flag so that I/Os can continue.
          */
+        mutex_enter(&mpt->m_taskmgmt_mutex);
         mpt->m_in_reset = FALSE;
+        mutex_exit(&mpt->m_taskmgmt_mutex);
 
         return (rval);
 }
 
 static int