7351 NVMe driver sporadically loses track of a completed I/O request, which
leads to zpool hangs and machine panics.
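
Taken together, the hunks below replace the one-completion-at-a-time
nvme_retrieve_cmd() with nvme_process_cq_cmds(), which drains every pending
completion queue entry under nq_mutex, and they add a ddi_periodic that polls
any queue pair whose last doorbell activity (nq_ts) is older than
nvme_intr_timeout_ns while commands are still outstanding, so a completion
whose interrupt never arrives is eventually picked up. The diff touches
nvme.c only; it relies on a few new soft-state members (nq_ts, nq_list_node,
nq_nvme, n_intr_timeouts) that the companion nvme_var.h change would have to
declare. A minimal sketch of those declarations, with types inferred from how
the fields are used below (an assumption, not part of the patch shown here):

    /* nvme_var.h (sketch): members the nvme.c hunks below assume exist. */
    typedef struct nvme_qpair {
            /* ... existing nvme_qpair_t members unchanged ... */
            hrtime_t        nq_ts;           /* gethrtime() at last doorbell write */
            list_node_t     nq_list_node;    /* linkage on the global nvme_qp_list */
            struct nvme     *nq_nvme;        /* back pointer for nvme_intr_monitor() */
    } nvme_qpair_t;

    typedef struct nvme {
            /* ... existing nvme_t members unchanged ... */
            uint64_t        n_intr_timeouts; /* completions recovered by the monitor */
    } nvme_t;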

@@ -180,10 +180,11 @@
 #include <sys/disp.h>
 #include <sys/blkdev.h>
 #include <sys/atomic.h>
 #include <sys/archsystm.h>
 #include <sys/sata/sata_hba.h>
+#include <sys/time.h>
 
 #include "nvme_reg.h"
 #include "nvme_var.h"
 
 

@@ -209,11 +210,11 @@
 static void nvme_free_cmd(nvme_cmd_t *);
 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
     bd_xfer_t *);
 static int nvme_admin_cmd(nvme_cmd_t *, int);
 static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
-static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
+static int nvme_process_cq_cmds(nvme_t *, nvme_qpair_t *);
 static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
 static void nvme_wakeup_cmd(void *);
 static void nvme_async_event_task(void *);
 
 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);

@@ -257,13 +258,23 @@
 static int nvme_bd_sync(void *, bd_xfer_t *);
 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
 
 static void nvme_prepare_devid(nvme_t *, uint32_t);
 
+static void nvme_intr_monitor(void *arg);
+
 static void *nvme_state;
 static kmem_cache_t *nvme_cmd_cache;
 
+static list_t nvme_qp_list;
+static kmutex_t nvme_global_mutex;
+static ddi_periodic_t nvme_cyclic;
+int nvme_cyclic_seconds = 5;
+hrtime_t nvme_intr_timeout_ns = 3 * NANOSEC;
+uint64_t nvme_intr_timeouts = 0;
+boolean_t nvme_enable_intr_monitoring = B_TRUE;
+
 /*
  * DMA attributes for queue DMA memory
  *
  * Queue DMA memory must be page aligned. The maximum length of a queue is
  * 65536 entries, and an entry can be 64 bytes long.

@@ -380,10 +391,18 @@
                 return (error);
 
         nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
             sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
 
+        mutex_init(&nvme_global_mutex, NULL, MUTEX_DRIVER, 0);
+
+        list_create(&nvme_qp_list, sizeof (nvme_qpair_t),
+            offsetof(nvme_qpair_t, nq_list_node));
+
+        nvme_cyclic = ddi_periodic_add(nvme_intr_monitor, NULL,
+            NANOSEC * nvme_cyclic_seconds, DDI_IPL_0);
+
         bd_mod_init(&nvme_dev_ops);
 
         error = mod_install(&nvme_modlinkage);
         if (error != DDI_SUCCESS) {
                 ddi_soft_state_fini(&nvme_state);

@@ -400,10 +419,15 @@
 
         error = mod_remove(&nvme_modlinkage);
         if (error == DDI_SUCCESS) {
                 ddi_soft_state_fini(&nvme_state);
                 kmem_cache_destroy(nvme_cmd_cache);
+                if (nvme_cyclic != NULL) {
+                        ddi_periodic_delete(nvme_cyclic);
+                        nvme_cyclic = NULL;
+                }
+                mutex_destroy(&nvme_global_mutex);
                 bd_mod_fini(&nvme_dev_ops);
         }
 
         return (error);
 }

@@ -704,57 +728,72 @@
         qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 
         tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
         nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
 
+        if (nvme_enable_intr_monitoring)
+                qp->nq_ts = gethrtime();
         mutex_exit(&qp->nq_mutex);
+
         return (DDI_SUCCESS);
 }
 
-static nvme_cmd_t *
-nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
+static int
+nvme_process_cq_cmds(nvme_t *nvme, nvme_qpair_t *qp)
 {
         nvme_reg_cqhdbl_t head = { 0 };
 
         nvme_cqe_t *cqe;
         nvme_cmd_t *cmd;
+        int cnt_cmds = 0;
 
         (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
             sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
 
         cqe = &qp->nq_cq[qp->nq_cqhead];
-
         /* Check phase tag of CQE. Hardware inverts it for new entries. */
         if (cqe->cqe_sf.sf_p == qp->nq_phase)
-                return (NULL);
+                return (cnt_cmds);
 
+        mutex_enter(&qp->nq_mutex);
+        while (cqe->cqe_sf.sf_p != qp->nq_phase) {
         ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
         ASSERT(cqe->cqe_cid < qp->nq_nentry);
 
-        mutex_enter(&qp->nq_mutex);
         cmd = qp->nq_cmd[cqe->cqe_cid];
         qp->nq_cmd[cqe->cqe_cid] = NULL;
         qp->nq_active_cmds--;
-        mutex_exit(&qp->nq_mutex);
 
         ASSERT(cmd != NULL);
         ASSERT(cmd->nc_nvme == nvme);
         ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
         ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
         bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
 
         qp->nq_sqhead = cqe->cqe_sqhd;
 
-        head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
+                qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
 
         /* Toggle phase on wrap-around. */
         if (qp->nq_cqhead == 0)
                 qp->nq_phase = qp->nq_phase ? 0 : 1;
+                taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
+                     cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
+                cnt_cmds++;
+                cqe = &qp->nq_cq[qp->nq_cqhead];
+        }
 
-        nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
+        if (cnt_cmds != 0) {
+                head.b.cqhdbl_cqh = qp->nq_cqhead;
+                nvme_put32(nvme, qp->nq_cqhdbl, head.r);
+                if (nvme_enable_intr_monitoring)
+                        qp->nq_ts = gethrtime();
+        }
 
-        return (cmd);
+        mutex_exit(&qp->nq_mutex);
+
+        return (cnt_cmds);
 }
 
 static int
 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
 {

@@ -1655,10 +1694,15 @@
                 return (DDI_FAILURE);
         }
 
         nvme_free_cmd(cmd);
 
+        mutex_enter(&nvme_global_mutex);
+        list_insert_head(&nvme_qp_list, qp);
+        qp->nq_nvme = nvme;
+        mutex_exit(&nvme_global_mutex);
+
         return (DDI_SUCCESS);
 }
 
 static boolean_t
 nvme_reset(nvme_t *nvme, boolean_t quiesce)

@@ -2182,10 +2226,11 @@
         nvme_t *nvme = (nvme_t *)arg1;
         int inum = (int)(uintptr_t)arg2;
         int ccnt = 0;
         int qnum;
         nvme_cmd_t *cmd;
+        int cnt_cmds;
 
         if (inum >= nvme->n_intr_cnt)
                 return (DDI_INTR_UNCLAIMED);
 
         /*

@@ -2194,15 +2239,12 @@
          * in steps of n_intr_cnt to process all queues using this vector.
          */
         for (qnum = inum;
             qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
             qnum += nvme->n_intr_cnt) {
-                while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
-                        taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
-                            cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
-                        ccnt++;
-                }
+                cnt_cmds = nvme_process_cq_cmds(nvme, nvme->n_ioq[qnum]);
+                ccnt += cnt_cmds;
         }
 
         return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
 }
 

@@ -2529,10 +2571,13 @@
 
         if (nvme->n_ioq_count > 0) {
                 for (i = 1; i != nvme->n_ioq_count + 1; i++) {
                         if (nvme->n_ioq[i] != NULL) {
                                 /* TODO: send destroy queue commands */
+                                mutex_enter(&nvme_global_mutex);
+                                list_remove(&nvme_qp_list, nvme->n_ioq[i]);
+                                mutex_exit(&nvme_global_mutex);
                                 nvme_free_qpair(nvme->n_ioq[i]);
                         }
                 }
 
                 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *

@@ -2842,5 +2887,28 @@
         nvme_namespace_t *ns = arg;
 
         return (ddi_devid_init(devinfo, DEVID_ENCAP, strlen(ns->ns_devid),
             ns->ns_devid, devid));
 }
+
+static void
+nvme_intr_monitor(void *arg)
+{
+        nvme_qpair_t *qp;
+        hrtime_t diff, now_ns;
+
+        if (!nvme_enable_intr_monitoring)
+                return;
+        mutex_enter(&nvme_global_mutex);
+        now_ns = gethrtime();
+        for (qp = list_head(&nvme_qp_list); qp != NULL;
+            qp = list_next(&nvme_qp_list, qp)) {
+                diff = now_ns - qp->nq_ts;
+                if (diff >= nvme_intr_timeout_ns && qp->nq_active_cmds > 0) {
+                        if (nvme_process_cq_cmds(qp->nq_nvme, qp)) {
+                                nvme_intr_timeouts++;
+                                qp->nq_nvme->n_intr_timeouts++;
+                        }
+                }
+        }
+        mutex_exit(&nvme_global_mutex);
+}
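
The new controls are ordinary module-scope globals, so they can presumably be
set like other illumos driver tunables via /etc/system (a hypothetical
example; the values shown are just the defaults from the patch):

    * Hypothetical /etc/system entries for the interrupt monitor added above.
    set nvme:nvme_enable_intr_monitoring = 1
    set nvme:nvme_intr_timeout_ns = 3000000000
    set nvme:nvme_cyclic_seconds = 5

Note that nvme_cyclic_seconds is read only once, when _init() calls
ddi_periodic_add(), so it takes effect only at module load time;
nvme_enable_intr_monitoring and nvme_intr_timeout_ns are consulted on every
monitor pass and on every submission/completion.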