7351 NVMe driver sporadically lost track of completed I/O requests, which led to zpool hangs and machine panics.
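For context on the failure mode: the driver learns of completed commands from completion queue (CQ) entries whose phase tag the controller inverts relative to the host's expected phase. A minimal sketch of that convention, assuming simplified stand-in types (my_cqe_t and my_qpair_t are illustrative, not the driver's real structures):

/*
 * Sketch only.  The controller writes each new CQ entry with the
 * phase bit inverted relative to the host's expected phase; the
 * host flips its expected phase each time the CQ head wraps.  An
 * entry whose phase bit still equals the expected phase has not
 * been written by the controller yet.
 */
typedef struct {
	unsigned int p:1;		/* phase tag */
} my_cqe_t;

typedef struct {
	my_cqe_t	*cq;		/* completion queue ring */
	unsigned int	cqhead;		/* next entry to consume */
	unsigned int	phase;		/* expected phase of new entries */
} my_qpair_t;

static int
cq_entry_ready(const my_qpair_t *qp)
{
	return (qp->cq[qp->cqhead].p != qp->phase);
}

When a completion interrupt is lost, such ready entries simply sit in the ring: nothing advances the CQ head, the affected commands never complete, and I/O above the driver (a zpool sync, for instance) blocks indefinitely. The change below adds a periodic watchdog that notices queue pairs with outstanding commands but no recent activity and reaps their completions directly.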
@@ -180,10 +180,11 @@
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
+#include <sys/time.h>
#include "nvme_reg.h"
#include "nvme_var.h"
@@ -209,11 +210,11 @@
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
bd_xfer_t *);
static int nvme_admin_cmd(nvme_cmd_t *, int);
static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
-static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
+static int nvme_process_cq_cmds(nvme_t *, nvme_qpair_t *);
static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);
static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
@@ -257,13 +258,23 @@
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
static void nvme_prepare_devid(nvme_t *, uint32_t);
+static void nvme_intr_monitor(void *arg);
+
static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;
+static list_t nvme_qp_list;
+static kmutex_t nvme_global_mutex;
+static ddi_periodic_t nvme_cyclic;
+int nvme_cyclic_seconds = 5;
+hrtime_t nvme_intr_timeout_ns = 3 * NANOSEC;
+uint64_t nvme_intr_timeouts = 0;
+boolean_t nvme_enable_intr_monitoring = B_TRUE;
+
/*
* DMA attributes for queue DMA memory
*
* Queue DMA memory must be page aligned. The maximum length of a queue is
* 65536 entries, and an entry can be 64 bytes long.
@@ -380,10 +391,18 @@
return (error);
nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
+ mutex_init(&nvme_global_mutex, NULL, MUTEX_DRIVER, 0);
+
+ list_create(&nvme_qp_list, sizeof (nvme_qpair_t),
+ offsetof(nvme_qpair_t, nq_list_node));
+
+ nvme_cyclic = ddi_periodic_add(nvme_intr_monitor, NULL,
+ NANOSEC * nvme_cyclic_seconds, DDI_IPL_0);
+
bd_mod_init(&nvme_dev_ops);
error = mod_install(&nvme_modlinkage);
if (error != DDI_SUCCESS) {
ddi_soft_state_fini(&nvme_state);
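ddi_periodic_add(9F), used above, arranges for nvme_intr_monitor() to run every nvme_cyclic_seconds seconds at DDI_IPL_0, the lowest interrupt level, where the handler may still block on adaptive mutexes such as nvme_global_mutex. A minimal standalone sketch of the add/delete pairing (my_tick and my_handle are illustrative names, not part of the driver):

#include <sys/time.h>
#include <sys/sunddi.h>

static ddi_periodic_t my_handle;

static void
my_tick(void *arg)
{
	/* periodic work; fires once per second in this sketch */
}

static void
my_start(void)
{
	my_handle = ddi_periodic_add(my_tick, NULL, NANOSEC, DDI_IPL_0);
}

static void
my_stop(void)
{
	ddi_periodic_delete(my_handle);
	my_handle = NULL;
}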
@@ -400,10 +419,15 @@
error = mod_remove(&nvme_modlinkage);
if (error == DDI_SUCCESS) {
ddi_soft_state_fini(&nvme_state);
kmem_cache_destroy(nvme_cmd_cache);
+ if (nvme_cyclic != NULL) {
+ ddi_periodic_delete(nvme_cyclic);
+ nvme_cyclic = NULL;
+ }
+ mutex_destroy(&nvme_global_mutex);
bd_mod_fini(&nvme_dev_ops);
}
return (error);
}
@@ -704,57 +728,72 @@
qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
+ if (nvme_enable_intr_monitoring)
+ qp->nq_ts = gethrtime();
mutex_exit(&qp->nq_mutex);
+
return (DDI_SUCCESS);
}
-static nvme_cmd_t *
-nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
+static int
+nvme_process_cq_cmds(nvme_t *nvme, nvme_qpair_t *qp)
{
nvme_reg_cqhdbl_t head = { 0 };
nvme_cqe_t *cqe;
nvme_cmd_t *cmd;
+ int cnt_cmds = 0;
(void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
cqe = &qp->nq_cq[qp->nq_cqhead];
-
/* Check phase tag of CQE. Hardware inverts it for new entries. */
if (cqe->cqe_sf.sf_p == qp->nq_phase)
- return (NULL);
+ return (cnt_cmds);
+ mutex_enter(&qp->nq_mutex);
+ while (cqe->cqe_sf.sf_p != qp->nq_phase) {
ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
ASSERT(cqe->cqe_cid < qp->nq_nentry);
- mutex_enter(&qp->nq_mutex);
cmd = qp->nq_cmd[cqe->cqe_cid];
qp->nq_cmd[cqe->cqe_cid] = NULL;
qp->nq_active_cmds--;
- mutex_exit(&qp->nq_mutex);
ASSERT(cmd != NULL);
ASSERT(cmd->nc_nvme == nvme);
ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
qp->nq_sqhead = cqe->cqe_sqhd;
- head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
+ qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
/* Toggle phase on wrap-around. */
if (qp->nq_cqhead == 0)
qp->nq_phase = qp->nq_phase ? 0 : 1;
+ taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
+ cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
+ cnt_cmds++;
+ cqe = &qp->nq_cq[qp->nq_cqhead];
+ }
- nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
+ if (cnt_cmds != 0) {
+ head.b.cqhdbl_cqh = qp->nq_cqhead;
+ nvme_put32(nvme, qp->nq_cqhdbl, head.r);
+ if (nvme_enable_intr_monitoring)
+ qp->nq_ts = gethrtime();
+ }
- return (cmd);
+ mutex_exit(&qp->nq_mutex);
+
+ return (cnt_cmds);
}
static int
nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
{
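A note on the taskq_dispatch_ent() call in the new loop: unlike plain taskq_dispatch(), it consumes a taskq_ent_t preallocated inside the command, so the dispatch cannot fail for lack of memory even with TQ_NOSLEEP, which makes it safe while draining the CQ with nq_mutex held. Batching the drain also means the CQ head doorbell is rung once per pass instead of once per command. A sketch of the pattern, assuming illustrative my_* names:

#include <sys/taskq.h>
#include <sys/taskq_impl.h>

typedef struct my_cmd {
	taskq_ent_t	mc_tqent;	/* preallocated dispatch entry */
	/* ... command state ... */
} my_cmd_t;

static void
my_done(void *arg)
{
	my_cmd_t *cmd = arg;
	/* completion work runs in taskq context, not interrupt context */
}

static void
my_complete(taskq_t *tq, my_cmd_t *cmd)
{
	/* cannot fail: mc_tqent is preallocated in the command */
	taskq_dispatch_ent(tq, my_done, cmd, TQ_NOSLEEP, &cmd->mc_tqent);
}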
@@ -1655,10 +1694,15 @@
return (DDI_FAILURE);
}
nvme_free_cmd(cmd);
+ mutex_enter(&nvme_global_mutex);
+ list_insert_head(&nvme_qp_list, qp);
+ qp->nq_nvme = nvme;
+ mutex_exit(&nvme_global_mutex);
+
return (DDI_SUCCESS);
}
static boolean_t
nvme_reset(nvme_t *nvme, boolean_t quiesce)
@@ -2182,10 +2226,11 @@
nvme_t *nvme = (nvme_t *)arg1;
int inum = (int)(uintptr_t)arg2;
int ccnt = 0;
int qnum;
nvme_cmd_t *cmd;
+ int cnt_cmds;
if (inum >= nvme->n_intr_cnt)
return (DDI_INTR_UNCLAIMED);
/*
@@ -2194,15 +2239,12 @@
* in steps of n_intr_cnt to process all queues using this vector.
*/
for (qnum = inum;
qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
qnum += nvme->n_intr_cnt) {
- while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
- taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
- cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
- ccnt++;
- }
+ cnt_cmds = nvme_process_cq_cmds(nvme, nvme->n_ioq[qnum]);
+ ccnt += cnt_cmds;
}
return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
}
@@ -2529,10 +2571,13 @@
if (nvme->n_ioq_count > 0) {
for (i = 1; i != nvme->n_ioq_count + 1; i++) {
if (nvme->n_ioq[i] != NULL) {
/* TODO: send destroy queue commands */
+ mutex_enter(&nvme_global_mutex);
+ list_remove(&nvme_qp_list, nvme->n_ioq[i]);
+ mutex_exit(&nvme_global_mutex);
nvme_free_qpair(nvme->n_ioq[i]);
}
}
kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
@@ -2842,5 +2887,28 @@
nvme_namespace_t *ns = arg;
return (ddi_devid_init(devinfo, DEVID_ENCAP, strlen(ns->ns_devid),
ns->ns_devid, devid));
}
+
+static void
+nvme_intr_monitor(void *arg)
+{
+ nvme_qpair_t *qp;
+ hrtime_t diff, now_ns;
+
+ if (!nvme_enable_intr_monitoring)
+ return;
+ mutex_enter(&nvme_global_mutex);
+ now_ns = gethrtime();
+ for (qp = list_head(&nvme_qp_list); qp != NULL;
+ qp = list_next(&nvme_qp_list, qp)) {
+ diff = now_ns - qp->nq_ts;
+ if (diff >= nvme_intr_timeout_ns && qp->nq_active_cmds > 0) {
+ if (nvme_process_cq_cmds(qp->nq_nvme, qp)) {
+ nvme_intr_timeouts++;
+ qp->nq_nvme->n_intr_timeouts++;
+ }
+ }
+ }
+ mutex_exit(&nvme_global_mutex);
+}
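With the defaults above (nvme_cyclic_seconds = 5, nvme_intr_timeout_ns = 3 * NANOSEC), a queue pair with commands outstanding but no submission or completion activity is reaped at most roughly 8 seconds after its last nq_ts update: up to 3 seconds for the timeout to elapse plus up to 5 seconds until the next monitor tick. The nvme_intr_timeouts (global) and n_intr_timeouts (per controller) counters are bumped only when nvme_process_cq_cmds() actually finds stale completions, so they count genuine lost interrupts rather than idle queues, and nvme_enable_intr_monitoring lets the whole watchdog be switched off without unloading the driver.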