7351 NVMe driver sporadically loses track of completed I/O requests, which
leads to zpool hangs and machine panics.
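The change has two parts, both visible in the hunks below. First, nvme_retrieve_cmd(), which returned at most one completed command per call, is replaced by nvme_process_cq_cmds(), which takes nq_mutex once, walks the completion queue until it reaches an entry whose phase tag still matches the queue's current phase, dispatches each completed command's nc_callback on the command taskq, and only then writes the CQ head doorbell. Second, the driver gains a global watchdog: every nvme_cyclic_seconds a ddi_periodic callback, nvme_intr_monitor(), walks the global nvme_qp_list and calls nvme_process_cq_cmds() on any queue pair that still has active commands but whose last-activity timestamp (nq_ts) is older than nvme_intr_timeout_ns, so completions whose interrupt was lost are still reaped instead of leaving the pool hung.

The following is a minimal, standalone userspace sketch of the phase-tag drain loop only; the names in it (cq_entry_t, queue_t, drain_cq) are invented for this example and are not the driver's, and it omits the locking, taskq dispatch, and doorbell write that the real function performs.

/*
 * Standalone sketch of the completion-queue drain performed by
 * nvme_process_cq_cmds().  cq_entry_t, queue_t and drain_cq() are
 * invented for this example; the real code also holds nq_mutex,
 * dispatches cmd->nc_callback on a taskq and rings the doorbell.
 */
#include <stdio.h>
#include <stdint.h>

#define NENTRY  8

typedef struct {
        uint16_t cid;           /* command id of the completed submission */
        uint8_t phase;          /* phase tag; controller inverts it per pass */
} cq_entry_t;

typedef struct {
        cq_entry_t cq[NENTRY];
        unsigned cqhead;        /* next completion entry to look at */
        uint8_t phase;          /* phase value that means "nothing new" */
        int active_cmds;        /* submitted but not yet completed */
} queue_t;

/* Reap every entry whose phase tag differs from ours; return the count. */
static int
drain_cq(queue_t *qp)
{
        int cnt = 0;

        while (qp->cq[qp->cqhead].phase != qp->phase) {
                cq_entry_t *cqe = &qp->cq[qp->cqhead];

                printf("completed cid %u\n", (unsigned)cqe->cid);
                qp->active_cmds--;
                cnt++;

                qp->cqhead = (qp->cqhead + 1) % NENTRY;
                if (qp->cqhead == 0)    /* toggle phase on wrap-around */
                        qp->phase = !qp->phase;
        }
        /* The driver writes the CQ head doorbell once here, not per entry. */
        return (cnt);
}

int
main(void)
{
        queue_t qp = { .phase = 0, .active_cmds = 3 };
        int drained;

        /* Pretend the controller posted three completions in one burst. */
        qp.cq[0] = (cq_entry_t){ .cid = 5, .phase = 1 };
        qp.cq[1] = (cq_entry_t){ .cid = 2, .phase = 1 };
        qp.cq[2] = (cq_entry_t){ .cid = 7, .phase = 1 };

        drained = drain_cq(&qp);
        printf("drained %d commands, %d still active\n",
            drained, qp.active_cmds);
        return (0);
}

The watchdog is a safety net rather than the primary completion path: under normal operation nvme_intr() still drains the queues, and nvme_intr_timeouts / n_intr_timeouts only count the cases where the periodic monitor found work that an interrupt should already have delivered.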

*** 180,189 ****
--- 180,190 ----
  #include <sys/disp.h>
  #include <sys/blkdev.h>
  #include <sys/atomic.h>
  #include <sys/archsystm.h>
  #include <sys/sata/sata_hba.h>
+ #include <sys/time.h>
  
  #include "nvme_reg.h"
  #include "nvme_var.h"
*** 209,219 ****
  static void nvme_free_cmd(nvme_cmd_t *);
  static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
      bd_xfer_t *);
  static int nvme_admin_cmd(nvme_cmd_t *, int);
  static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
! static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
  static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
  static void nvme_wakeup_cmd(void *);
  static void nvme_async_event_task(void *);
  static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
--- 210,220 ----
  static void nvme_free_cmd(nvme_cmd_t *);
  static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
      bd_xfer_t *);
  static int nvme_admin_cmd(nvme_cmd_t *, int);
  static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
! static int nvme_process_cq_cmds(nvme_t *, nvme_qpair_t *);
  static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
  static void nvme_wakeup_cmd(void *);
  static void nvme_async_event_task(void *);
  static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
*** 257,269 ****
--- 258,280 ----
  static int nvme_bd_sync(void *, bd_xfer_t *);
  static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
  static void nvme_prepare_devid(nvme_t *, uint32_t);
  
+ static void nvme_intr_monitor(void *arg);
+ 
  static void *nvme_state;
  static kmem_cache_t *nvme_cmd_cache;
  
+ static list_t nvme_qp_list;
+ static kmutex_t nvme_global_mutex;
+ static ddi_periodic_t nvme_cyclic;
+ int nvme_cyclic_seconds = 5;
+ hrtime_t nvme_intr_timeout_ns = 3 * NANOSEC;
+ uint64_t nvme_intr_timeouts = 0;
+ boolean_t nvme_enable_intr_monitoring = B_TRUE;
+ 
  /*
   * DMA attributes for queue DMA memory
   *
   * Queue DMA memory must be page aligned. The maximum length of a queue is
   * 65536 entries, and an entry can be 64 bytes long.
*** 380,389 ****
--- 391,408 ----
                  return (error);
  
          nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
              sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
  
+         mutex_init(&nvme_global_mutex, NULL, MUTEX_DRIVER, 0);
+ 
+         list_create(&nvme_qp_list, sizeof (nvme_qpair_t),
+             offsetof(nvme_qpair_t, nq_list_node));
+ 
+         nvme_cyclic = ddi_periodic_add(nvme_intr_monitor, NULL,
+             NANOSEC * nvme_cyclic_seconds, DDI_IPL_0);
+ 
          bd_mod_init(&nvme_dev_ops);
  
          error = mod_install(&nvme_modlinkage);
          if (error != DDI_SUCCESS) {
                  ddi_soft_state_fini(&nvme_state);
*** 400,409 ****
--- 419,433 ----
          error = mod_remove(&nvme_modlinkage);
          if (error == DDI_SUCCESS) {
                  ddi_soft_state_fini(&nvme_state);
                  kmem_cache_destroy(nvme_cmd_cache);
+                 if (nvme_cyclic != NULL) {
+                         ddi_periodic_delete(nvme_cyclic);
+                         nvme_cyclic = NULL;
+                 }
+                 mutex_destroy(&nvme_global_mutex);
                  bd_mod_fini(&nvme_dev_ops);
          }
  
          return (error);
  }
*** 704,760 ****
          qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
  
          tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
          nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
          mutex_exit(&qp->nq_mutex);
          return (DDI_SUCCESS);
  }
  
! static nvme_cmd_t *
! nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
  {
          nvme_reg_cqhdbl_t head = { 0 };
  
          nvme_cqe_t *cqe;
          nvme_cmd_t *cmd;
  
          (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
              sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
  
          cqe = &qp->nq_cq[qp->nq_cqhead];
- 
          /* Check phase tag of CQE. Hardware inverts it for new entries. */
          if (cqe->cqe_sf.sf_p == qp->nq_phase)
!                 return (NULL);
  
          ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
          ASSERT(cqe->cqe_cid < qp->nq_nentry);
  
-         mutex_enter(&qp->nq_mutex);
          cmd = qp->nq_cmd[cqe->cqe_cid];
          qp->nq_cmd[cqe->cqe_cid] = NULL;
          qp->nq_active_cmds--;
-         mutex_exit(&qp->nq_mutex);
  
          ASSERT(cmd != NULL);
          ASSERT(cmd->nc_nvme == nvme);
          ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
          ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
  
          bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
  
          qp->nq_sqhead = cqe->cqe_sqhd;
  
!         head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
  
          /* Toggle phase on wrap-around. */
          if (qp->nq_cqhead == 0)
                  qp->nq_phase = qp->nq_phase ? 0 : 1;
  
!         nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
!         return (cmd);
  }
  
  static int
  nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
  {
--- 728,799 ----
          qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
  
          tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
          nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
+         if (nvme_enable_intr_monitoring)
+                 qp->nq_ts = gethrtime();
          mutex_exit(&qp->nq_mutex);
+ 
          return (DDI_SUCCESS);
  }
  
! static int
! nvme_process_cq_cmds(nvme_t *nvme, nvme_qpair_t *qp)
  {
          nvme_reg_cqhdbl_t head = { 0 };
  
          nvme_cqe_t *cqe;
          nvme_cmd_t *cmd;
+         int cnt_cmds = 0;
  
          (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
              sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
  
          cqe = &qp->nq_cq[qp->nq_cqhead];
          /* Check phase tag of CQE. Hardware inverts it for new entries. */
          if (cqe->cqe_sf.sf_p == qp->nq_phase)
!                 return (cnt_cmds);
  
+         mutex_enter(&qp->nq_mutex);
+         while (cqe->cqe_sf.sf_p != qp->nq_phase) {
          ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
          ASSERT(cqe->cqe_cid < qp->nq_nentry);
  
          cmd = qp->nq_cmd[cqe->cqe_cid];
          qp->nq_cmd[cqe->cqe_cid] = NULL;
          qp->nq_active_cmds--;
  
          ASSERT(cmd != NULL);
          ASSERT(cmd->nc_nvme == nvme);
          ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
          ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
  
          bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
  
          qp->nq_sqhead = cqe->cqe_sqhd;
  
!         qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
  
          /* Toggle phase on wrap-around. */
          if (qp->nq_cqhead == 0)
                  qp->nq_phase = qp->nq_phase ? 0 : 1;
  
+         taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
+             cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
+         cnt_cmds++;
+         cqe = &qp->nq_cq[qp->nq_cqhead];
+         }
!         if (cnt_cmds != 0) {
!                 head.b.cqhdbl_cqh = qp->nq_cqhead;
!                 nvme_put32(nvme, qp->nq_cqhdbl, head.r);
!                 if (nvme_enable_intr_monitoring)
!                         qp->nq_ts = gethrtime();
!         }
!         mutex_exit(&qp->nq_mutex);
! 
!         return (cnt_cmds);
  }
  
  static int
  nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
  {
*** 1655,1664 ****
--- 1694,1708 ----
                  return (DDI_FAILURE);
          }
  
          nvme_free_cmd(cmd);
  
+         mutex_enter(&nvme_global_mutex);
+         list_insert_head(&nvme_qp_list, qp);
+         qp->nq_nvme = nvme;
+         mutex_exit(&nvme_global_mutex);
+ 
          return (DDI_SUCCESS);
  }
  
  static boolean_t
  nvme_reset(nvme_t *nvme, boolean_t quiesce)
*** 2182,2191 ****
--- 2226,2236 ----
          nvme_t *nvme = (nvme_t *)arg1;
          int inum = (int)(uintptr_t)arg2;
          int ccnt = 0;
          int qnum;
          nvme_cmd_t *cmd;
+         int cnt_cmds;
  
          if (inum >= nvme->n_intr_cnt)
                  return (DDI_INTR_UNCLAIMED);
  
          /*
*** 2194,2208 ****
           * in steps of n_intr_cnt to process all queues using this vector.
           */
          for (qnum = inum;
              qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
              qnum += nvme->n_intr_cnt) {
!                 while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
!                         taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
!                             cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
!                         ccnt++;
!                 }
          }
  
          return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
  }
--- 2239,2250 ----
           * in steps of n_intr_cnt to process all queues using this vector.
           */
          for (qnum = inum;
              qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
              qnum += nvme->n_intr_cnt) {
!                 cnt_cmds = nvme_process_cq_cmds(nvme, nvme->n_ioq[qnum]);
!                 ccnt += cnt_cmds;
          }
  
          return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
  }
*** 2529,2538 ****
--- 2571,2583 ----
          if (nvme->n_ioq_count > 0) {
                  for (i = 1; i != nvme->n_ioq_count + 1; i++) {
                          if (nvme->n_ioq[i] != NULL) {
                                  /* TODO: send destroy queue commands */
+                                 mutex_enter(&nvme_global_mutex);
+                                 list_remove(&nvme_qp_list, nvme->n_ioq[i]);
+                                 mutex_exit(&nvme_global_mutex);
                                  nvme_free_qpair(nvme->n_ioq[i]);
                          }
                  }
  
                  kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
*** 2842,2846 ****
--- 2887,2914 ----
          nvme_namespace_t *ns = arg;
  
          return (ddi_devid_init(devinfo, DEVID_ENCAP, strlen(ns->ns_devid),
              ns->ns_devid, devid));
  }
+ 
+ static void
+ nvme_intr_monitor(void *arg)
+ {
+         nvme_qpair_t *qp;
+         hrtime_t diff, now_ns;
+ 
+         if (!nvme_enable_intr_monitoring)
+                 return;
+         mutex_enter(&nvme_global_mutex);
+         now_ns = gethrtime();
+         for (qp = list_head(&nvme_qp_list); qp != NULL;
+             qp = list_next(&nvme_qp_list, qp)) {
+                 diff = now_ns - qp->nq_ts;
+                 if (diff >= nvme_intr_timeout_ns && qp->nq_active_cmds > 0) {
+                         if (nvme_process_cq_cmds(qp->nq_nvme, qp)) {
+                                 nvme_intr_timeouts++;
+                                 qp->nq_nvme->n_intr_timeouts++;
+                         }
+                 }
+         }
+         mutex_exit(&nvme_global_mutex);
+ }