7351 NVMe driver sporadically loses track of completed I/O requests, which
can lead to zpool hangs and machine panics.
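The patch below replaces nvme_retrieve_cmd(), which handed back at most one
completed command per call, with nvme_process_cq_cmds(), which drains every
ready completion-queue entry under the queue mutex, and it adds a periodic
nvme_intr_monitor() as a safety net for completions whose interrupt never
arrived. What follows is a minimal user-space sketch of that drain idea only;
the types and names (fake_cqe_t, fake_qpair_t, process_cq) are simplified
stand-ins for illustration, not the driver's real nvme_reg.h/nvme_var.h
structures.

    /*
     * Stand-alone sketch of draining a completion queue by phase tag:
     * consume every entry the "controller" has marked new, not just one.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define QDEPTH  8

    typedef struct {
            uint16_t cid;           /* command identifier */
            uint8_t  phase;         /* phase tag written by the controller */
    } fake_cqe_t;

    typedef struct {
            fake_cqe_t cq[QDEPTH];  /* completion queue, zeroed at start */
            unsigned   cqhead;      /* next entry to consume */
            uint8_t    phase;       /* phase value that marks a new entry */
    } fake_qpair_t;

    /* Drain all ready completions; return how many were handled. */
    static int
    process_cq(fake_qpair_t *qp)
    {
            int cnt = 0;

            while (qp->cq[qp->cqhead].phase == qp->phase) {
                    printf("completed cid %u\n",
                        (unsigned)qp->cq[qp->cqhead].cid);
                    qp->cqhead = (qp->cqhead + 1) % QDEPTH;
                    if (qp->cqhead == 0)    /* wrap-around: phase flips */
                            qp->phase = !qp->phase;
                    cnt++;
            }
            return (cnt);
    }

    int
    main(void)
    {
            fake_qpair_t qp = { .phase = 1 };
            int i;

            /* Pretend the controller posted three completions back to back. */
            for (i = 0; i < 3; i++) {
                    qp.cq[i].cid = (uint16_t)i;
                    qp.cq[i].phase = 1;
            }

            /* A single "interrupt" now reaps all three, not just the first. */
            printf("handled %d completions\n", process_cq(&qp));
            return (0);
    }

In the patched driver the completion callbacks are dispatched to the command
taskq from inside nvme_process_cq_cmds(), and nvme_intr_monitor(), driven by a
ddi_periodic every nvme_cyclic_seconds, re-runs the same drain for any queue
that still has active commands but has seen no submission or completion
activity for nvme_intr_timeout_ns.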

Before (unpatched):
 165 #include <sys/byteorder.h>
 166 #ifdef _BIG_ENDIAN
 167 #error nvme driver needs porting for big-endian platforms
 168 #endif
 169 
 170 #include <sys/modctl.h>
 171 #include <sys/conf.h>
 172 #include <sys/devops.h>
 173 #include <sys/ddi.h>
 174 #include <sys/sunddi.h>
 175 #include <sys/bitmap.h>
 176 #include <sys/sysmacros.h>
 177 #include <sys/param.h>
 178 #include <sys/varargs.h>
 179 #include <sys/cpuvar.h>
 180 #include <sys/disp.h>
 181 #include <sys/blkdev.h>
 182 #include <sys/atomic.h>
 183 #include <sys/archsystm.h>
 184 #include <sys/sata/sata_hba.h>

 185 
 186 #include "nvme_reg.h"
 187 #include "nvme_var.h"
 188 
 189 
 190 /* NVMe spec version supported */
 191 static const int nvme_version_major = 1;
 192 static const int nvme_version_minor = 0;
 193 
 194 /* tunable for admin command timeout in seconds, default is 1s */
 195 static volatile int nvme_admin_cmd_timeout = 1;
 196 
 197 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
 198 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
 199 static int nvme_quiesce(dev_info_t *);
 200 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
 201 static int nvme_setup_interrupts(nvme_t *, int, int);
 202 static void nvme_release_interrupts(nvme_t *);
 203 static uint_t nvme_intr(caddr_t, caddr_t);
 204 
 205 static void nvme_shutdown(nvme_t *, int, boolean_t);
 206 static boolean_t nvme_reset(nvme_t *, boolean_t);
 207 static int nvme_init(nvme_t *);
 208 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 209 static void nvme_free_cmd(nvme_cmd_t *);
 210 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
 211     bd_xfer_t *);
 212 static int nvme_admin_cmd(nvme_cmd_t *, int);
 213 static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
 214 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
 215 static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
 216 static void nvme_wakeup_cmd(void *);
 217 static void nvme_async_event_task(void *);
 218 
 219 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
 220 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
 221 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
 222 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
 223 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
 224 static inline int nvme_check_cmd_status(nvme_cmd_t *);
 225 
 226 static void nvme_abort_cmd(nvme_cmd_t *);
 227 static int nvme_async_event(nvme_t *);
 228 static void *nvme_get_logpage(nvme_t *, uint8_t, ...);
 229 static void *nvme_identify(nvme_t *, uint32_t);
 230 static int nvme_set_nqueues(nvme_t *, uint16_t);
 231 
 232 static void nvme_free_dma(nvme_dma_t *);
 233 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
 234     nvme_dma_t **);


 242 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
 243 static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
 244 static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
 245 
 246 static boolean_t nvme_check_regs_hdl(nvme_t *);
 247 static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
 248 
 249 static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);
 250 
 251 static void nvme_bd_xfer_done(void *);
 252 static void nvme_bd_driveinfo(void *, bd_drive_t *);
 253 static int nvme_bd_mediainfo(void *, bd_media_t *);
 254 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
 255 static int nvme_bd_read(void *, bd_xfer_t *);
 256 static int nvme_bd_write(void *, bd_xfer_t *);
 257 static int nvme_bd_sync(void *, bd_xfer_t *);
 258 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
 259 
 260 static void nvme_prepare_devid(nvme_t *, uint32_t);
 261 


 262 static void *nvme_state;
 263 static kmem_cache_t *nvme_cmd_cache;
 264 








 265 /*
 266  * DMA attributes for queue DMA memory
 267  *
 268  * Queue DMA memory must be page aligned. The maximum length of a queue is
 269  * 65536 entries, and an entry can be 64 bytes long.
 270  */
 271 static ddi_dma_attr_t nvme_queue_dma_attr = {
 272         .dma_attr_version       = DMA_ATTR_V0,
 273         .dma_attr_addr_lo       = 0,
 274         .dma_attr_addr_hi       = 0xffffffffffffffffULL,
 275         .dma_attr_count_max     = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
 276         .dma_attr_align         = 0x1000,
 277         .dma_attr_burstsizes    = 0x7ff,
 278         .dma_attr_minxfer       = 0x1000,
 279         .dma_attr_maxxfer       = (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
 280         .dma_attr_seg           = 0xffffffffffffffffULL,
 281         .dma_attr_sgllen        = 1,
 282         .dma_attr_granular      = 1,
 283         .dma_attr_flags         = 0,
 284 };


 365         .o_drive_info   = nvme_bd_driveinfo,
 366         .o_media_info   = nvme_bd_mediainfo,
 367         .o_devid_init   = nvme_bd_devid,
 368         .o_sync_cache   = nvme_bd_sync,
 369         .o_read         = nvme_bd_read,
 370         .o_write        = nvme_bd_write,
 371 };
 372 
 373 int
 374 _init(void)
 375 {
 376         int error;
 377 
 378         error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
 379         if (error != DDI_SUCCESS)
 380                 return (error);
 381 
 382         nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
 383             sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
 384 








 385         bd_mod_init(&nvme_dev_ops);
 386 
 387         error = mod_install(&nvme_modlinkage);
 388         if (error != DDI_SUCCESS) {
 389                 ddi_soft_state_fini(&nvme_state);
 390                 bd_mod_fini(&nvme_dev_ops);
 391         }
 392 
 393         return (error);
 394 }
 395 
 396 int
 397 _fini(void)
 398 {
 399         int error;
 400 
 401         error = mod_remove(&nvme_modlinkage);
 402         if (error == DDI_SUCCESS) {
 403                 ddi_soft_state_fini(&nvme_state);
 404                 kmem_cache_destroy(nvme_cmd_cache);





 405                 bd_mod_fini(&nvme_dev_ops);
 406         }
 407 
 408         return (error);
 409 }
 410 
 411 int
 412 _info(struct modinfo *modinfop)
 413 {
 414         return (mod_info(&nvme_modlinkage, modinfop));
 415 }
 416 
 417 static inline void
 418 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
 419 {
 420         ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
 421 
 422         /*LINTED: E_BAD_PTR_CAST_ALIGN*/
 423         ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
 424 }


 689          * slot. If the slot is already occupied advance to the next slot and
 690          * try again. This can happen for long running commands like async event
 691          * requests.
 692          */
 693         while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 694                 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 695         qp->nq_cmd[qp->nq_next_cmd] = cmd;
 696 
 697         qp->nq_active_cmds++;
 698 
 699         cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
 700         bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
 701         (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
 702             sizeof (nvme_sqe_t) * qp->nq_sqtail,
 703             sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
 704         qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 705 
 706         tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
 707         nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
 708 


 709         mutex_exit(&qp->nq_mutex);

 710         return (DDI_SUCCESS);
 711 }
 712 
 713 static nvme_cmd_t *
 714 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
 715 {
 716         nvme_reg_cqhdbl_t head = { 0 };
 717 
 718         nvme_cqe_t *cqe;
 719         nvme_cmd_t *cmd;

 720 
 721         (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
 722             sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
 723 
 724         cqe = &qp->nq_cq[qp->nq_cqhead];
 725 
 726         /* Check phase tag of CQE. Hardware inverts it for new entries. */
 727         if (cqe->cqe_sf.sf_p == qp->nq_phase)
 728                 return (NULL);
 729 


 730         ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
 731         ASSERT(cqe->cqe_cid < qp->nq_nentry);
 732 
 733         mutex_enter(&qp->nq_mutex);
 734         cmd = qp->nq_cmd[cqe->cqe_cid];
 735         qp->nq_cmd[cqe->cqe_cid] = NULL;
 736         qp->nq_active_cmds--;
 737         mutex_exit(&qp->nq_mutex);
 738 
 739         ASSERT(cmd != NULL);
 740         ASSERT(cmd->nc_nvme == nvme);
 741         ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
 742         ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
 743         bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
 744 
 745         qp->nq_sqhead = cqe->cqe_sqhd;
 746 
 747         head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
 748 
 749         /* Toggle phase on wrap-around. */
 750         if (qp->nq_cqhead == 0)
 751                 qp->nq_phase = qp->nq_phase ? 0 : 1;





 752 
 753         nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);





 754 
 755         return (cmd);


 756 }
 757 
 758 static int
 759 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
 760 {
 761         nvme_cqe_t *cqe = &cmd->nc_cqe;
 762 
 763         dev_err(cmd->nc_nvme->n_dip, CE_WARN,
 764             "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
 765             "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
 766             cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
 767             cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
 768 
 769         bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
 770 
 771         if (cmd->nc_nvme->n_strict_version) {
 772                 cmd->nc_nvme->n_dead = B_TRUE;
 773                 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
 774         }
 775 


1640         cmd->nc_sqe.sqe_cdw10 = dw10.r;
1641         cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
1642         cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
1643 
1644         if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
1645                 dev_err(nvme->n_dip, CE_WARN,
1646                     "!nvme_admin_cmd failed for CREATE SQUEUE");
1647                 return (DDI_FAILURE);
1648         }
1649 
1650         if (nvme_check_cmd_status(cmd)) {
1651                 dev_err(nvme->n_dip, CE_WARN,
1652                     "!CREATE SQUEUE failed with sct = %x, sc = %x",
1653                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1654                 nvme_free_cmd(cmd);
1655                 return (DDI_FAILURE);
1656         }
1657 
1658         nvme_free_cmd(cmd);
1659 





1660         return (DDI_SUCCESS);
1661 }
1662 
1663 static boolean_t
1664 nvme_reset(nvme_t *nvme, boolean_t quiesce)
1665 {
1666         nvme_reg_csts_t csts;
1667         int i;
1668 
1669         nvme_put32(nvme, NVME_REG_CC, 0);
1670 
1671         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
1672         if (csts.b.csts_rdy == 1) {
1673                 nvme_put32(nvme, NVME_REG_CC, 0);
1674                 for (i = 0; i != nvme->n_timeout * 10; i++) {
1675                         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
1676                         if (csts.b.csts_rdy == 0)
1677                                 break;
1678 
1679                         if (quiesce)


2167                         goto fail;
2168                 }
2169         }
2170 
2171         return (DDI_SUCCESS);
2172 
2173 fail:
2174         (void) nvme_reset(nvme, B_FALSE);
2175         return (DDI_FAILURE);
2176 }
2177 
2178 static uint_t
2179 nvme_intr(caddr_t arg1, caddr_t arg2)
2180 {
2181         /*LINTED: E_PTR_BAD_CAST_ALIGN*/
2182         nvme_t *nvme = (nvme_t *)arg1;
2183         int inum = (int)(uintptr_t)arg2;
2184         int ccnt = 0;
2185         int qnum;
2186         nvme_cmd_t *cmd;

2187 
2188         if (inum >= nvme->n_intr_cnt)
2189                 return (DDI_INTR_UNCLAIMED);
2190 
2191         /*
2192          * The interrupt vector a queue uses is calculated as queue_idx %
2193          * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
2194          * in steps of n_intr_cnt to process all queues using this vector.
2195          */
2196         for (qnum = inum;
2197             qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
2198             qnum += nvme->n_intr_cnt) {
2199                 while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
2200                         taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
2201                             cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
2202                         ccnt++;
2203                 }
2204         }
2205 
2206         return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
2207 }
2208 
2209 static void
2210 nvme_release_interrupts(nvme_t *nvme)
2211 {
2212         int i;
2213 
2214         for (i = 0; i < nvme->n_intr_cnt; i++) {
2215                 if (nvme->n_inth[i] == NULL)
2216                         break;
2217 
2218                 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
2219                         (void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
2220                 else
2221                         (void) ddi_intr_disable(nvme->n_inth[i]);
2222 
2223                 (void) ddi_intr_remove_handler(nvme->n_inth[i]);


2514 
2515                         if (nvme->n_ns[i].ns_idns)
2516                                 kmem_free(nvme->n_ns[i].ns_idns,
2517                                     sizeof (nvme_identify_nsid_t));
2518                 }
2519 
2520                 kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
2521                     nvme->n_namespace_count);
2522         }
2523 
2524         if (nvme->n_progress & NVME_INTERRUPTS)
2525                 nvme_release_interrupts(nvme);
2526 
2527         if (nvme->n_cmd_taskq)
2528                 ddi_taskq_wait(nvme->n_cmd_taskq);
2529 
2530         if (nvme->n_ioq_count > 0) {
2531                 for (i = 1; i != nvme->n_ioq_count + 1; i++) {
2532                         if (nvme->n_ioq[i] != NULL) {
2533                                 /* TODO: send destroy queue commands */



2534                                 nvme_free_qpair(nvme->n_ioq[i]);
2535                         }
2536                 }
2537 
2538                 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
2539                     (nvme->n_ioq_count + 1));
2540         }
2541 
2542         if (nvme->n_progress & NVME_REGS_MAPPED) {
2543                 nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE);
2544                 (void) nvme_reset(nvme, B_FALSE);
2545         }
2546 
2547         if (nvme->n_cmd_taskq)
2548                 ddi_taskq_destroy(nvme->n_cmd_taskq);
2549 
2550         if (nvme->n_progress & NVME_CTRL_LIMITS)
2551                 sema_destroy(&nvme->n_abort_sema);
2552 
2553         if (nvme->n_progress & NVME_ADMIN_QUEUE)


2826 
2827         /*
2828          * If the volatile write cache isn't enabled the FLUSH command is a
2829          * no-op, so we can take a shortcut here.
2830          */
2831         if (ns->ns_nvme->n_volatile_write_cache_enabled == B_FALSE) {
2832                 bd_xfer_done(xfer, ENOTSUP);
2833                 return (0);
2834         }
2835 
2836         return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
2837 }
2838 
2839 static int
2840 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
2841 {
2842         nvme_namespace_t *ns = arg;
2843 
2844         return (ddi_devid_init(devinfo, DEVID_ENCAP, strlen(ns->ns_devid),
2845             ns->ns_devid, devid));
2846 }

After (patched):

 165 #include <sys/byteorder.h>
 166 #ifdef _BIG_ENDIAN
 167 #error nvme driver needs porting for big-endian platforms
 168 #endif
 169 
 170 #include <sys/modctl.h>
 171 #include <sys/conf.h>
 172 #include <sys/devops.h>
 173 #include <sys/ddi.h>
 174 #include <sys/sunddi.h>
 175 #include <sys/bitmap.h>
 176 #include <sys/sysmacros.h>
 177 #include <sys/param.h>
 178 #include <sys/varargs.h>
 179 #include <sys/cpuvar.h>
 180 #include <sys/disp.h>
 181 #include <sys/blkdev.h>
 182 #include <sys/atomic.h>
 183 #include <sys/archsystm.h>
 184 #include <sys/sata/sata_hba.h>
 185 #include <sys/time.h>
 186 
 187 #include "nvme_reg.h"
 188 #include "nvme_var.h"
 189 
 190 
 191 /* NVMe spec version supported */
 192 static const int nvme_version_major = 1;
 193 static const int nvme_version_minor = 0;
 194 
 195 /* tunable for admin command timeout in seconds, default is 1s */
 196 static volatile int nvme_admin_cmd_timeout = 1;
 197 
 198 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
 199 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
 200 static int nvme_quiesce(dev_info_t *);
 201 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
 202 static int nvme_setup_interrupts(nvme_t *, int, int);
 203 static void nvme_release_interrupts(nvme_t *);
 204 static uint_t nvme_intr(caddr_t, caddr_t);
 205 
 206 static void nvme_shutdown(nvme_t *, int, boolean_t);
 207 static boolean_t nvme_reset(nvme_t *, boolean_t);
 208 static int nvme_init(nvme_t *);
 209 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 210 static void nvme_free_cmd(nvme_cmd_t *);
 211 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
 212     bd_xfer_t *);
 213 static int nvme_admin_cmd(nvme_cmd_t *, int);
 214 static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
 215 static int nvme_process_cq_cmds(nvme_t *, nvme_qpair_t *);
 216 static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
 217 static void nvme_wakeup_cmd(void *);
 218 static void nvme_async_event_task(void *);
 219 
 220 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
 221 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
 222 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
 223 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
 224 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
 225 static inline int nvme_check_cmd_status(nvme_cmd_t *);
 226 
 227 static void nvme_abort_cmd(nvme_cmd_t *);
 228 static int nvme_async_event(nvme_t *);
 229 static void *nvme_get_logpage(nvme_t *, uint8_t, ...);
 230 static void *nvme_identify(nvme_t *, uint32_t);
 231 static int nvme_set_nqueues(nvme_t *, uint16_t);
 232 
 233 static void nvme_free_dma(nvme_dma_t *);
 234 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
 235     nvme_dma_t **);


 243 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
 244 static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
 245 static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
 246 
 247 static boolean_t nvme_check_regs_hdl(nvme_t *);
 248 static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
 249 
 250 static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);
 251 
 252 static void nvme_bd_xfer_done(void *);
 253 static void nvme_bd_driveinfo(void *, bd_drive_t *);
 254 static int nvme_bd_mediainfo(void *, bd_media_t *);
 255 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
 256 static int nvme_bd_read(void *, bd_xfer_t *);
 257 static int nvme_bd_write(void *, bd_xfer_t *);
 258 static int nvme_bd_sync(void *, bd_xfer_t *);
 259 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
 260 
 261 static void nvme_prepare_devid(nvme_t *, uint32_t);
 262 
 263 static void nvme_intr_monitor(void *arg);
 264 
 265 static void *nvme_state;
 266 static kmem_cache_t *nvme_cmd_cache;
 267 
 268 static list_t nvme_qp_list;
 269 static kmutex_t nvme_global_mutex;
 270 static ddi_periodic_t nvme_cyclic;
 271 int nvme_cyclic_seconds = 5;
 272 hrtime_t nvme_intr_timeout_ns = 3 * NANOSEC;
 273 uint64_t nvme_intr_timeouts = 0;
 274 boolean_t nvme_enable_intr_monitoring = B_TRUE;
 275 
 276 /*
 277  * DMA attributes for queue DMA memory
 278  *
 279  * Queue DMA memory must be page aligned. The maximum length of a queue is
 280  * 65536 entries, and an entry can be 64 bytes long.
 281  */
 282 static ddi_dma_attr_t nvme_queue_dma_attr = {
 283         .dma_attr_version       = DMA_ATTR_V0,
 284         .dma_attr_addr_lo       = 0,
 285         .dma_attr_addr_hi       = 0xffffffffffffffffULL,
 286         .dma_attr_count_max     = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
 287         .dma_attr_align         = 0x1000,
 288         .dma_attr_burstsizes    = 0x7ff,
 289         .dma_attr_minxfer       = 0x1000,
 290         .dma_attr_maxxfer       = (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
 291         .dma_attr_seg           = 0xffffffffffffffffULL,
 292         .dma_attr_sgllen        = 1,
 293         .dma_attr_granular      = 1,
 294         .dma_attr_flags         = 0,
 295 };


 376         .o_drive_info   = nvme_bd_driveinfo,
 377         .o_media_info   = nvme_bd_mediainfo,
 378         .o_devid_init   = nvme_bd_devid,
 379         .o_sync_cache   = nvme_bd_sync,
 380         .o_read         = nvme_bd_read,
 381         .o_write        = nvme_bd_write,
 382 };
 383 
 384 int
 385 _init(void)
 386 {
 387         int error;
 388 
 389         error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
 390         if (error != DDI_SUCCESS)
 391                 return (error);
 392 
 393         nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
 394             sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
 395 
 396         mutex_init(&nvme_global_mutex, NULL, MUTEX_DRIVER, 0);
 397 
 398         list_create(&nvme_qp_list, sizeof (nvme_qpair_t),
 399             offsetof(nvme_qpair_t, nq_list_node));
 400 
 401         nvme_cyclic = ddi_periodic_add(nvme_intr_monitor, NULL,
 402             NANOSEC * nvme_cyclic_seconds, DDI_IPL_0);
 403 
 404         bd_mod_init(&nvme_dev_ops);
 405 
 406         error = mod_install(&nvme_modlinkage);
 407         if (error != DDI_SUCCESS) {
 408                 ddi_soft_state_fini(&nvme_state);
 409                 bd_mod_fini(&nvme_dev_ops);
 410         }
 411 
 412         return (error);
 413 }
 414 
 415 int
 416 _fini(void)
 417 {
 418         int error;
 419 
 420         error = mod_remove(&nvme_modlinkage);
 421         if (error == DDI_SUCCESS) {
 422                 ddi_soft_state_fini(&nvme_state);
 423                 kmem_cache_destroy(nvme_cmd_cache);
 424                 if (nvme_cyclic != NULL) {
 425                         ddi_periodic_delete(nvme_cyclic);
 426                         nvme_cyclic = NULL;
 427                 }
 428                 mutex_destroy(&nvme_global_mutex);
 429                 bd_mod_fini(&nvme_dev_ops);
 430         }
 431 
 432         return (error);
 433 }
 434 
 435 int
 436 _info(struct modinfo *modinfop)
 437 {
 438         return (mod_info(&nvme_modlinkage, modinfop));
 439 }
 440 
 441 static inline void
 442 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
 443 {
 444         ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
 445 
 446         /*LINTED: E_BAD_PTR_CAST_ALIGN*/
 447         ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
 448 }


 713          * slot. If the slot is already occupied advance to the next slot and
 714          * try again. This can happen for long running commands like async event
 715          * requests.
 716          */
 717         while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 718                 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 719         qp->nq_cmd[qp->nq_next_cmd] = cmd;
 720 
 721         qp->nq_active_cmds++;
 722 
 723         cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
 724         bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
 725         (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
 726             sizeof (nvme_sqe_t) * qp->nq_sqtail,
 727             sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
 728         qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 729 
 730         tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
 731         nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
 732 
 733         if (nvme_enable_intr_monitoring)
 734                 qp->nq_ts = gethrtime();
 735         mutex_exit(&qp->nq_mutex);
 736 
 737         return (DDI_SUCCESS);
 738 }
 739 
 740 static int
 741 nvme_process_cq_cmds(nvme_t *nvme, nvme_qpair_t *qp)
 742 {
 743         nvme_reg_cqhdbl_t head = { 0 };
 744 
 745         nvme_cqe_t *cqe;
 746         nvme_cmd_t *cmd;
 747         int cnt_cmds = 0;
 748 
 749         (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
 750             sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
 751 
 752         cqe = &qp->nq_cq[qp->nq_cqhead];

 753         /* Check phase tag of CQE. Hardware inverts it for new entries. */
 754         if (cqe->cqe_sf.sf_p == qp->nq_phase)
 755                 return (cnt_cmds);
 756 
 757         mutex_enter(&qp->nq_mutex);
 758         while (cqe->cqe_sf.sf_p != qp->nq_phase) {
 759                 ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
 760                 ASSERT(cqe->cqe_cid < qp->nq_nentry);
 761 

 762                 cmd = qp->nq_cmd[cqe->cqe_cid];
 763                 qp->nq_cmd[cqe->cqe_cid] = NULL;
 764                 qp->nq_active_cmds--;

 765 
 766                 ASSERT(cmd != NULL);
 767                 ASSERT(cmd->nc_nvme == nvme);
 768                 ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
 769                 ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
 770                 bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
 771 
 772                 qp->nq_sqhead = cqe->cqe_sqhd;
 773 
 774                 qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
 775 
 776                 /* Toggle phase on wrap-around. */
 777                 if (qp->nq_cqhead == 0)
 778                         qp->nq_phase = qp->nq_phase ? 0 : 1;
 779                 taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
 780                      cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
 781                 cnt_cmds++;
 782                 cqe = &qp->nq_cq[qp->nq_cqhead];
 783         }
 784 
 785         if (cnt_cmds != 0) {
 786                 head.b.cqhdbl_cqh = qp->nq_cqhead;
 787                 nvme_put32(nvme, qp->nq_cqhdbl, head.r);
 788                 if (nvme_enable_intr_monitoring)
 789                         qp->nq_ts = gethrtime();
 790         }
 791 
 792         mutex_exit(&qp->nq_mutex);
 793 
 794         return (cnt_cmds);
 795 }
 796 
 797 static int
 798 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
 799 {
 800         nvme_cqe_t *cqe = &cmd->nc_cqe;
 801 
 802         dev_err(cmd->nc_nvme->n_dip, CE_WARN,
 803             "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
 804             "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
 805             cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
 806             cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
 807 
 808         bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
 809 
 810         if (cmd->nc_nvme->n_strict_version) {
 811                 cmd->nc_nvme->n_dead = B_TRUE;
 812                 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
 813         }
 814 


1679         cmd->nc_sqe.sqe_cdw10 = dw10.r;
1680         cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
1681         cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
1682 
1683         if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
1684                 dev_err(nvme->n_dip, CE_WARN,
1685                     "!nvme_admin_cmd failed for CREATE SQUEUE");
1686                 return (DDI_FAILURE);
1687         }
1688 
1689         if (nvme_check_cmd_status(cmd)) {
1690                 dev_err(nvme->n_dip, CE_WARN,
1691                     "!CREATE SQUEUE failed with sct = %x, sc = %x",
1692                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1693                 nvme_free_cmd(cmd);
1694                 return (DDI_FAILURE);
1695         }
1696 
1697         nvme_free_cmd(cmd);
1698 
1699         mutex_enter(&nvme_global_mutex);
1700         list_insert_head(&nvme_qp_list, qp);
1701         qp->nq_nvme = nvme;
1702         mutex_exit(&nvme_global_mutex);
1703 
1704         return (DDI_SUCCESS);
1705 }
1706 
1707 static boolean_t
1708 nvme_reset(nvme_t *nvme, boolean_t quiesce)
1709 {
1710         nvme_reg_csts_t csts;
1711         int i;
1712 
1713         nvme_put32(nvme, NVME_REG_CC, 0);
1714 
1715         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
1716         if (csts.b.csts_rdy == 1) {
1717                 nvme_put32(nvme, NVME_REG_CC, 0);
1718                 for (i = 0; i != nvme->n_timeout * 10; i++) {
1719                         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
1720                         if (csts.b.csts_rdy == 0)
1721                                 break;
1722 
1723                         if (quiesce)


2211                         goto fail;
2212                 }
2213         }
2214 
2215         return (DDI_SUCCESS);
2216 
2217 fail:
2218         (void) nvme_reset(nvme, B_FALSE);
2219         return (DDI_FAILURE);
2220 }
2221 
2222 static uint_t
2223 nvme_intr(caddr_t arg1, caddr_t arg2)
2224 {
2225         /*LINTED: E_PTR_BAD_CAST_ALIGN*/
2226         nvme_t *nvme = (nvme_t *)arg1;
2227         int inum = (int)(uintptr_t)arg2;
2228         int ccnt = 0;
2229         int qnum;
2230         nvme_cmd_t *cmd;
2231         int cnt_cmds;
2232 
2233         if (inum >= nvme->n_intr_cnt)
2234                 return (DDI_INTR_UNCLAIMED);
2235 
2236         /*
2237          * The interrupt vector a queue uses is calculated as queue_idx %
2238          * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
2239          * in steps of n_intr_cnt to process all queues using this vector.
2240          */
2241         for (qnum = inum;
2242             qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
2243             qnum += nvme->n_intr_cnt) {
2244                 cnt_cmds =  nvme_process_cq_cmds(nvme, nvme->n_ioq[qnum]);
2245                 ccnt += cnt_cmds;



2246         }
2247 
2248         return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
2249 }
2250 
2251 static void
2252 nvme_release_interrupts(nvme_t *nvme)
2253 {
2254         int i;
2255 
2256         for (i = 0; i < nvme->n_intr_cnt; i++) {
2257                 if (nvme->n_inth[i] == NULL)
2258                         break;
2259 
2260                 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
2261                         (void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
2262                 else
2263                         (void) ddi_intr_disable(nvme->n_inth[i]);
2264 
2265                 (void) ddi_intr_remove_handler(nvme->n_inth[i]);


2556 
2557                         if (nvme->n_ns[i].ns_idns)
2558                                 kmem_free(nvme->n_ns[i].ns_idns,
2559                                     sizeof (nvme_identify_nsid_t));
2560                 }
2561 
2562                 kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
2563                     nvme->n_namespace_count);
2564         }
2565 
2566         if (nvme->n_progress & NVME_INTERRUPTS)
2567                 nvme_release_interrupts(nvme);
2568 
2569         if (nvme->n_cmd_taskq)
2570                 ddi_taskq_wait(nvme->n_cmd_taskq);
2571 
2572         if (nvme->n_ioq_count > 0) {
2573                 for (i = 1; i != nvme->n_ioq_count + 1; i++) {
2574                         if (nvme->n_ioq[i] != NULL) {
2575                                 /* TODO: send destroy queue commands */
2576                                 mutex_enter(&nvme_global_mutex);
2577                                 list_remove(&nvme_qp_list, nvme->n_ioq[i]);
2578                                 mutex_exit(&nvme_global_mutex);
2579                                 nvme_free_qpair(nvme->n_ioq[i]);
2580                         }
2581                 }
2582 
2583                 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
2584                     (nvme->n_ioq_count + 1));
2585         }
2586 
2587         if (nvme->n_progress & NVME_REGS_MAPPED) {
2588                 nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE);
2589                 (void) nvme_reset(nvme, B_FALSE);
2590         }
2591 
2592         if (nvme->n_cmd_taskq)
2593                 ddi_taskq_destroy(nvme->n_cmd_taskq);
2594 
2595         if (nvme->n_progress & NVME_CTRL_LIMITS)
2596                 sema_destroy(&nvme->n_abort_sema);
2597 
2598         if (nvme->n_progress & NVME_ADMIN_QUEUE)


2871 
2872         /*
2873          * If the volatile write cache isn't enabled the FLUSH command is a
2874          * no-op, so we can take a shortcut here.
2875          */
2876         if (ns->ns_nvme->n_volatile_write_cache_enabled == B_FALSE) {
2877                 bd_xfer_done(xfer, ENOTSUP);
2878                 return (0);
2879         }
2880 
2881         return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
2882 }
2883 
2884 static int
2885 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
2886 {
2887         nvme_namespace_t *ns = arg;
2888 
2889         return (ddi_devid_init(devinfo, DEVID_ENCAP, strlen(ns->ns_devid),
2890             ns->ns_devid, devid));
2891 }
2892 
2893 static void
2894 nvme_intr_monitor(void *arg)
2895 {
2896         nvme_qpair_t *qp;
2897         hrtime_t diff, now_ns;
2898 
2899         if (!nvme_enable_intr_monitoring)
2900                 return;
2901         mutex_enter(&nvme_global_mutex);
2902         now_ns = gethrtime();
2903         for (qp = list_head(&nvme_qp_list); qp != NULL;
2904             qp = list_next(&nvme_qp_list, qp)) {
2905                 diff = now_ns - qp->nq_ts;
2906                 if (diff >= nvme_intr_timeout_ns && qp->nq_active_cmds > 0) {
2907                         if (nvme_process_cq_cmds(qp->nq_nvme, qp)) {
2908                                 nvme_intr_timeouts++;
2909                                 qp->nq_nvme->n_intr_timeouts++;
2910                         }
2911                 }
2912         }
2913         mutex_exit(&nvme_global_mutex);
2914 }