7351 NVMe driver sporadically loses track of completed I/O requests, which
leads to zpool hangs and machine panics.
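
As the diff below shows, the single-entry nvme_retrieve_cmd(), which updated
the completion queue head and rang the doorbell outside nq_mutex, is replaced
by nvme_process_cq_cmds(), which drains every pending completion queue entry
and writes the head doorbell once, all while holding the queue mutex. In
addition, a periodic monitor, nvme_intr_monitor(), runs every
nvme_cyclic_seconds and polls any queue pair that still has active commands
but has shown no submission or completion activity for at least
nvme_intr_timeout_ns, so completions whose interrupt was lost are eventually
reaped instead of hanging the pool.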

          --- old/usr/src/uts/common/io/nvme/nvme.c
          +++ new/usr/src/uts/common/io/nvme/nvme.c
[ 174 lines elided ]
 175  175  #include <sys/bitmap.h>
 176  176  #include <sys/sysmacros.h>
 177  177  #include <sys/param.h>
 178  178  #include <sys/varargs.h>
 179  179  #include <sys/cpuvar.h>
 180  180  #include <sys/disp.h>
 181  181  #include <sys/blkdev.h>
 182  182  #include <sys/atomic.h>
 183  183  #include <sys/archsystm.h>
 184  184  #include <sys/sata/sata_hba.h>
      185 +#include <sys/time.h>
 185  186  
 186  187  #include "nvme_reg.h"
 187  188  #include "nvme_var.h"
 188  189  
 189  190  
 190  191  /* NVMe spec version supported */
 191  192  static const int nvme_version_major = 1;
 192  193  static const int nvme_version_minor = 0;
 193  194  
 194  195  /* tunable for admin command timeout in seconds, default is 1s */
[ 9 lines elided ]
 204  205  
 205  206  static void nvme_shutdown(nvme_t *, int, boolean_t);
 206  207  static boolean_t nvme_reset(nvme_t *, boolean_t);
 207  208  static int nvme_init(nvme_t *);
 208  209  static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 209  210  static void nvme_free_cmd(nvme_cmd_t *);
 210  211  static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
 211  212      bd_xfer_t *);
 212  213  static int nvme_admin_cmd(nvme_cmd_t *, int);
 213  214  static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
 214      -static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
      215 +static int nvme_process_cq_cmds(nvme_t *, nvme_qpair_t *);
 215  216  static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
 216  217  static void nvme_wakeup_cmd(void *);
 217  218  static void nvme_async_event_task(void *);
 218  219  
 219  220  static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
 220  221  static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
 221  222  static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
 222  223  static int nvme_check_specific_cmd_status(nvme_cmd_t *);
 223  224  static int nvme_check_generic_cmd_status(nvme_cmd_t *);
 224  225  static inline int nvme_check_cmd_status(nvme_cmd_t *);
[ 27 lines elided ]
 252  253  static void nvme_bd_driveinfo(void *, bd_drive_t *);
 253  254  static int nvme_bd_mediainfo(void *, bd_media_t *);
 254  255  static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
 255  256  static int nvme_bd_read(void *, bd_xfer_t *);
 256  257  static int nvme_bd_write(void *, bd_xfer_t *);
 257  258  static int nvme_bd_sync(void *, bd_xfer_t *);
 258  259  static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
 259  260  
 260  261  static void nvme_prepare_devid(nvme_t *, uint32_t);
 261  262  
      263 +static void nvme_intr_monitor(void *arg);
      264 +
 262  265  static void *nvme_state;
 263  266  static kmem_cache_t *nvme_cmd_cache;
 264  267  
      268 +static list_t nvme_qp_list;
      269 +static kmutex_t nvme_global_mutex;
      270 +static ddi_periodic_t nvme_cyclic;
      271 +int nvme_cyclic_seconds = 5;
      272 +hrtime_t nvme_intr_timeout_ns = 3 * NANOSEC;
      273 +uint64_t nvme_intr_timeouts = 0;
      274 +boolean_t nvme_enable_intr_monitoring = B_TRUE;
      275 +
 265  276  /*
 266  277   * DMA attributes for queue DMA memory
 267  278   *
 268  279   * Queue DMA memory must be page aligned. The maximum length of a queue is
 269  280   * 65536 entries, and an entry can be 64 bytes long.
 270  281   */
 271  282  static ddi_dma_attr_t nvme_queue_dma_attr = {
 272  283          .dma_attr_version       = DMA_ATTR_V0,
 273  284          .dma_attr_addr_lo       = 0,
 274  285          .dma_attr_addr_hi       = 0xffffffffffffffffULL,
[ 100 lines elided ]
 375  386  {
 376  387          int error;
 377  388  
 378  389          error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
 379  390          if (error != DDI_SUCCESS)
 380  391                  return (error);
 381  392  
 382  393          nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
 383  394              sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
 384  395  
      396 +        mutex_init(&nvme_global_mutex, NULL, MUTEX_DRIVER, 0);
      397 +
      398 +        list_create(&nvme_qp_list, sizeof (nvme_qpair_t),
      399 +            offsetof(nvme_qpair_t, nq_list_node));
      400 +
      401 +        nvme_cyclic = ddi_periodic_add(nvme_intr_monitor, NULL,
      402 +            NANOSEC * nvme_cyclic_seconds, DDI_IPL_0);
      403 +
 385  404          bd_mod_init(&nvme_dev_ops);
 386  405  
 387  406          error = mod_install(&nvme_modlinkage);
 388  407          if (error != DDI_SUCCESS) {
 389  408                  ddi_soft_state_fini(&nvme_state);
 390  409                  bd_mod_fini(&nvme_dev_ops);
 391  410          }
 392  411  
 393  412          return (error);
 394  413  }
 395  414  
 396  415  int
 397  416  _fini(void)
 398  417  {
 399  418          int error;
 400  419  
 401  420          error = mod_remove(&nvme_modlinkage);
 402  421          if (error == DDI_SUCCESS) {
 403  422                  ddi_soft_state_fini(&nvme_state);
 404  423                  kmem_cache_destroy(nvme_cmd_cache);
      424 +                if (nvme_cyclic != NULL) {
      425 +                        ddi_periodic_delete(nvme_cyclic);
      426 +                        nvme_cyclic = NULL;
      427 +                }
      428 +                mutex_destroy(&nvme_global_mutex);
 405  429                  bd_mod_fini(&nvme_dev_ops);
 406  430          }
 407  431  
 408  432          return (error);
 409  433  }
 410  434  
 411  435  int
 412  436  _info(struct modinfo *modinfop)
 413  437  {
 414  438          return (mod_info(&nvme_modlinkage, modinfop));
[ 284 lines elided ]
 699  723          cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
 700  724          bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
 701  725          (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
 702  726              sizeof (nvme_sqe_t) * qp->nq_sqtail,
 703  727              sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
 704  728          qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 705  729  
 706  730          tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
 707  731          nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
 708  732  
      733 +        if (nvme_enable_intr_monitoring)
      734 +                qp->nq_ts = gethrtime();
 709  735          mutex_exit(&qp->nq_mutex);
      736 +
 710  737          return (DDI_SUCCESS);
 711  738  }
 712  739  
 713      -static nvme_cmd_t *
 714      -nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
      740 +static int
      741 +nvme_process_cq_cmds(nvme_t *nvme, nvme_qpair_t *qp)
 715  742  {
 716  743          nvme_reg_cqhdbl_t head = { 0 };
 717  744  
 718  745          nvme_cqe_t *cqe;
 719  746          nvme_cmd_t *cmd;
      747 +        int cnt_cmds = 0;
 720  748  
 721  749          (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
 722  750              sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
 723  751  
 724  752          cqe = &qp->nq_cq[qp->nq_cqhead];
 725      -
 726  753          /* Check phase tag of CQE. Hardware inverts it for new entries. */
 727  754          if (cqe->cqe_sf.sf_p == qp->nq_phase)
 728      -                return (NULL);
 729      -
 730      -        ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
 731      -        ASSERT(cqe->cqe_cid < qp->nq_nentry);
      755 +                return (cnt_cmds);
 732  756  
 733  757          mutex_enter(&qp->nq_mutex);
 734      -        cmd = qp->nq_cmd[cqe->cqe_cid];
 735      -        qp->nq_cmd[cqe->cqe_cid] = NULL;
 736      -        qp->nq_active_cmds--;
 737      -        mutex_exit(&qp->nq_mutex);
 738      -
 739      -        ASSERT(cmd != NULL);
 740      -        ASSERT(cmd->nc_nvme == nvme);
 741      -        ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
 742      -        ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
 743      -        bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
 744      -
 745      -        qp->nq_sqhead = cqe->cqe_sqhd;
 746      -
 747      -        head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
 748      -
 749      -        /* Toggle phase on wrap-around. */
 750      -        if (qp->nq_cqhead == 0)
 751      -                qp->nq_phase = qp->nq_phase ? 0 : 1;
      758 +        while (cqe->cqe_sf.sf_p != qp->nq_phase) {
      759 +                ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
      760 +                ASSERT(cqe->cqe_cid < qp->nq_nentry);
      761 +
      762 +                cmd = qp->nq_cmd[cqe->cqe_cid];
      763 +                qp->nq_cmd[cqe->cqe_cid] = NULL;
      764 +                qp->nq_active_cmds--;
      765 +
      766 +                ASSERT(cmd != NULL);
      767 +                ASSERT(cmd->nc_nvme == nvme);
      768 +                ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
      769 +                ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
      770 +                bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
      771 +
      772 +                qp->nq_sqhead = cqe->cqe_sqhd;
      773 +
      774 +                qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
      775 +
      776 +                /* Toggle phase on wrap-around. */
      777 +                if (qp->nq_cqhead == 0)
      778 +                        qp->nq_phase = qp->nq_phase ? 0 : 1;
      779 +                taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
      780 +                     cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
      781 +                cnt_cmds++;
      782 +                cqe = &qp->nq_cq[qp->nq_cqhead];
      783 +        }
      784 +
      785 +        if (cnt_cmds != 0) {
      786 +                head.b.cqhdbl_cqh = qp->nq_cqhead;
      787 +                nvme_put32(nvme, qp->nq_cqhdbl, head.r);
      788 +                if (nvme_enable_intr_monitoring)
      789 +                        qp->nq_ts = gethrtime();
      790 +        }
 752  791  
 753      -        nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
      792 +        mutex_exit(&qp->nq_mutex);
 754  793  
 755      -        return (cmd);
      794 +        return (cnt_cmds);
 756  795  }
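
For context, here is a minimal, self-contained sketch of the phase-tag
convention the loop above relies on; the cq_entry_t and cq_t types are
hypothetical stand-ins, not the driver's structures. The controller inverts
the phase bit it writes on every pass through the ring, so an entry is new
exactly when its phase bit differs from the phase value of the previous pass,
which the consumer tracks and flips on wrap-around:

    #include <stdint.h>

    /*
     * Hypothetical, simplified completion-ring types; the real driver
     * uses nvme_cqe_t and nvme_qpair_t.
     */
    typedef struct {
            uint8_t phase;          /* phase bit written by the controller */
            /* ... command id, status, SQ head pointer, ... */
    } cq_entry_t;

    typedef struct {
            cq_entry_t      *ring;
            unsigned        nentry;
            unsigned        head;
            uint8_t         old_phase;      /* phase of the previous pass */
    } cq_t;

    /*
     * Drain all new entries. An entry whose phase bit still equals
     * old_phase was consumed on an earlier pass and is stale.
     */
    static int
    cq_drain(cq_t *cq)
    {
            int n = 0;

            while (cq->ring[cq->head].phase != cq->old_phase) {
                    /* ... complete the command this entry names ... */
                    cq->head = (cq->head + 1) % cq->nentry;
                    if (cq->head == 0)      /* wrapped: flip expected phase */
                            cq->old_phase ^= 1;
                    n++;
            }
            /* Caller writes cq->head to the CQ head doorbell if n != 0. */
            return (n);
    }

nvme_process_cq_cmds() follows the same pattern, additionally dispatching each
completed command's callback through the taskq and ringing the CQ head
doorbell once after the drain.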
 757  796  
 758  797  static int
 759  798  nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
 760  799  {
 761  800          nvme_cqe_t *cqe = &cmd->nc_cqe;
 762  801  
 763  802          dev_err(cmd->nc_nvme->n_dip, CE_WARN,
 764  803              "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
 765  804              "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
[ 884 lines elided ]
1650 1689          if (nvme_check_cmd_status(cmd)) {
1651 1690                  dev_err(nvme->n_dip, CE_WARN,
1652 1691                      "!CREATE SQUEUE failed with sct = %x, sc = %x",
1653 1692                      cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1654 1693                  nvme_free_cmd(cmd);
1655 1694                  return (DDI_FAILURE);
1656 1695          }
1657 1696  
1658 1697          nvme_free_cmd(cmd);
1659 1698  
     1699 +        mutex_enter(&nvme_global_mutex);
     1700 +        list_insert_head(&nvme_qp_list, qp);
     1701 +        qp->nq_nvme = nvme;
     1702 +        mutex_exit(&nvme_global_mutex);
     1703 +
1660 1704          return (DDI_SUCCESS);
1661 1705  }
1662 1706  
1663 1707  static boolean_t
1664 1708  nvme_reset(nvme_t *nvme, boolean_t quiesce)
1665 1709  {
1666 1710          nvme_reg_csts_t csts;
1667 1711          int i;
1668 1712  
1669 1713          nvme_put32(nvme, NVME_REG_CC, 0);
[ 507 lines elided ]
2177 2221  
2178 2222  static uint_t
2179 2223  nvme_intr(caddr_t arg1, caddr_t arg2)
2180 2224  {
2181 2225          /*LINTED: E_PTR_BAD_CAST_ALIGN*/
2182 2226          nvme_t *nvme = (nvme_t *)arg1;
2183 2227          int inum = (int)(uintptr_t)arg2;
2184 2228          int ccnt = 0;
2185 2229          int qnum;
2186 2230          nvme_cmd_t *cmd;
     2231 +        int cnt_cmds;
2187 2232  
2188 2233          if (inum >= nvme->n_intr_cnt)
2189 2234                  return (DDI_INTR_UNCLAIMED);
2190 2235  
2191 2236          /*
2192 2237           * The interrupt vector a queue uses is calculated as queue_idx %
2193 2238           * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
2194 2239           * in steps of n_intr_cnt to process all queues using this vector.
2195 2240           */
2196 2241          for (qnum = inum;
2197 2242              qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
2198 2243              qnum += nvme->n_intr_cnt) {
2199      -                while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
2200      -                        taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
2201      -                            cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
2202      -                        ccnt++;
2203      -                }
      2244 +                cnt_cmds = nvme_process_cq_cmds(nvme, nvme->n_ioq[qnum]);
     2245 +                ccnt += cnt_cmds;
2204 2246          }
2205 2247  
2206 2248          return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
2207 2249  }
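
For example, with n_intr_cnt == 4, the handler for vector 1 walks queue pairs
1, 5, 9, and so on, draining each via nvme_process_cq_cmds() and claiming the
interrupt only if at least one queue produced completions.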
2208 2250  
2209 2251  static void
2210 2252  nvme_release_interrupts(nvme_t *nvme)
2211 2253  {
2212 2254          int i;
2213 2255  
[ 310 lines elided ]
2524 2566          if (nvme->n_progress & NVME_INTERRUPTS)
2525 2567                  nvme_release_interrupts(nvme);
2526 2568  
2527 2569          if (nvme->n_cmd_taskq)
2528 2570                  ddi_taskq_wait(nvme->n_cmd_taskq);
2529 2571  
2530 2572          if (nvme->n_ioq_count > 0) {
2531 2573                  for (i = 1; i != nvme->n_ioq_count + 1; i++) {
2532 2574                          if (nvme->n_ioq[i] != NULL) {
2533 2575                                  /* TODO: send destroy queue commands */
     2576 +                                mutex_enter(&nvme_global_mutex);
     2577 +                                list_remove(&nvme_qp_list, nvme->n_ioq[i]);
     2578 +                                mutex_exit(&nvme_global_mutex);
2534 2579                                  nvme_free_qpair(nvme->n_ioq[i]);
2535 2580                          }
2536 2581                  }
2537 2582  
2538 2583                  kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
2539 2584                      (nvme->n_ioq_count + 1));
2540 2585          }
2541 2586  
2542 2587          if (nvme->n_progress & NVME_REGS_MAPPED) {
2543 2588                  nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE);
[ 292 lines elided ]
2836 2881          return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
2837 2882  }
2838 2883  
2839 2884  static int
2840 2885  nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
2841 2886  {
2842 2887          nvme_namespace_t *ns = arg;
2843 2888  
2844 2889          return (ddi_devid_init(devinfo, DEVID_ENCAP, strlen(ns->ns_devid),
2845 2890              ns->ns_devid, devid));
     2891 +}
     2892 +
     2893 +static void
     2894 +nvme_intr_monitor(void *arg)
     2895 +{
     2896 +        nvme_qpair_t *qp;
     2897 +        hrtime_t diff, now_ns;
     2898 +
     2899 +        if (!nvme_enable_intr_monitoring)
     2900 +                return;
     2901 +        mutex_enter(&nvme_global_mutex);
     2902 +        now_ns = gethrtime();
     2903 +        for (qp = list_head(&nvme_qp_list); qp != NULL;
     2904 +            qp = list_next(&nvme_qp_list, qp)) {
     2905 +                diff = now_ns - qp->nq_ts;
     2906 +                if (diff >= nvme_intr_timeout_ns && qp->nq_active_cmds > 0) {
     2907 +                        if (nvme_process_cq_cmds(qp->nq_nvme, qp)) {
     2908 +                                nvme_intr_timeouts++;
     2909 +                                qp->nq_nvme->n_intr_timeouts++;
     2910 +                        }
     2911 +                }
     2912 +        }
     2913 +        mutex_exit(&nvme_global_mutex);
2846 2914  }
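
With the defaults above (nvme_cyclic_seconds = 5, nvme_intr_timeout_ns =
3 * NANOSEC), a queue pair with outstanding commands becomes eligible for
polling once it has seen no submission or completion activity for 3 seconds,
and the monitor sweeps every 5 seconds, so a completion whose interrupt was
lost is reaped after roughly 3 to 8 seconds. Each sweep in which
nvme_process_cq_cmds() actually finds pending completions increments the
global nvme_intr_timeouts counter and the controller's n_intr_timeouts.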
    