1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * xdf.c - Xen Virtual Block Device Driver
  29  * TODO:
  30  *      - support alternate block size (currently only DEV_BSIZE supported)
  31  *      - revalidate geometry for removable devices
  32  *
  33  * This driver exports Solaris disk device nodes, accepts IO requests from
  34  * those nodes, and services those requests by talking to a backend device
  35  * in another domain.
  36  *
  37  * Communication with the backend device is done via a ringbuffer (which is
  38  * managed via xvdi interfaces) and dma memory (which is managed via ddi
  39  * interfaces).
  40  *
  41  * Communication with the backend device is dependent upon establishing a
  42  * connection to the backend device.  This connection process involves
  43  * reading device configuration information from xenbus and publishing
  44  * some frontend runtime configuration parameters via the xenbus (for
  45  * consumption by the backend).  Once we've published runtime configuration
  46  * information via the xenbus, the backend device can enter the connected
  47  * state and we'll enter the XD_CONNECTED state.  But before we can allow
  48  * random IO to begin, we need to do IO to the backend device to determine
  49  * the device label and if flush operations are supported.  Once this is
  50  * done we enter the XD_READY state and can process any IO operations.
  51  *
  52  * We receive notifications of xenbus state changes for the backend device
  53  * (aka, the "other end") via the xdf_oe_change() callback.  This callback
  54  * is single threaded, meaning that we can't receive a new notification of
  55  * an other end state change while we're processing an outstanding
  56  * notification of an other end state change.  Therefore we can't do any
  57  * blocking operations from the xdf_oe_change() callback.  This is why we
  58  * have a separate taskq (xdf_ready_tq) which exists to do the necessary
  59  * IO to get us from the XD_CONNECTED to the XD_READY state.  All IO
  60  * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
  61  * through xdf_lb_rdwr(), which is a synchronous IO interface.  IOs
  62  * generated by the xdf_ready_tq_thread thread have priority over all
  63  * other IO requests.
  64  *
  65  * We also communicate with the backend device via the xenbus "media-req"
  66  * (XBP_MEDIA_REQ) property.  For more information on this see the
  67  * comments in blkif.h.
  68  */
  69 
  70 #include <io/xdf.h>
  71 
  72 #include <sys/conf.h>
  73 #include <sys/dkio.h>
  74 #include <sys/promif.h>
  75 #include <sys/sysmacros.h>
  76 #include <sys/kstat.h>
  77 #include <sys/mach_mmu.h>
  78 #ifdef XPV_HVM_DRIVER
  79 #include <sys/xpv_support.h>
  80 #include <sys/sunndi.h>
  81 #else /* !XPV_HVM_DRIVER */
  82 #include <sys/evtchn_impl.h>
  83 #endif /* !XPV_HVM_DRIVER */
  84 #include <public/io/xenbus.h>
  85 #include <xen/sys/xenbus_impl.h>
  86 #include <sys/scsi/generic/inquiry.h>
  87 #include <xen/io/blkif_impl.h>
  88 #include <sys/fdio.h>
  89 #include <sys/cdio.h>
  90 
  91 /*
  92  * DEBUG_EVAL can be used to include debug only statements without
  93  * having to use '#ifdef DEBUG' statements
  94  */
  95 #ifdef DEBUG
  96 #define DEBUG_EVAL(x)   (x)
  97 #else /* !DEBUG */
  98 #define DEBUG_EVAL(x)
  99 #endif /* !DEBUG */
 100 
 101 #define XDF_DRAIN_MSEC_DELAY            (50*1000)       /* 00.05 sec */
 102 #define XDF_DRAIN_RETRY_COUNT           200             /* 10.00 sec */
 103 
 104 #define INVALID_DOMID   ((domid_t)-1)
 105 #define FLUSH_DISKCACHE 0x1
 106 #define WRITE_BARRIER   0x2
 107 #define DEFAULT_FLUSH_BLOCK     156 /* block to write to cause a cache flush */
 108 #define USE_WRITE_BARRIER(vdp)                                          \
 109         ((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
 110 #define USE_FLUSH_DISKCACHE(vdp)                                        \
 111         ((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
 112 #define IS_WRITE_BARRIER(vdp, bp)                                       \
 113         (!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&                      \
 114         ((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
 115 #define IS_FLUSH_DISKCACHE(bp)                                          \
 116         (!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
 117 
 118 #define VREQ_DONE(vreq)                                                 \
 119         VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&               \
 120             (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||               \
 121             (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))
 122 
 123 #define BP_VREQ(bp)             ((v_req_t *)((bp)->av_back))
 124 #define BP_VREQ_SET(bp, vreq)   (((bp)->av_back = (buf_t *)(vreq)))
 125 
 126 extern int              do_polled_io;
 127 
 128 /* run-time tunables that we don't want the compiler to optimize away */
 129 volatile int            xdf_debug = 0;
 130 volatile boolean_t      xdf_barrier_flush_disable = B_FALSE;
 131 
 132 /* per module globals */
 133 major_t                 xdf_major;
 134 static void             *xdf_ssp;
 135 static kmem_cache_t     *xdf_vreq_cache;
 136 static kmem_cache_t     *xdf_gs_cache;
 137 static int              xdf_maxphys = XB_MAXPHYS;
 138 static diskaddr_t       xdf_flush_block = DEFAULT_FLUSH_BLOCK;
 139 static int              xdf_fbrewrites; /* flush block re-write count */
 140 
 141 /* misc public functions (used by xdf_shell.c) */
 142 int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
 143 int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
 144 
 145 /*  misc private functions */
 146 static void xdf_io_start(xdf_t *);
 147 
 148 /* callbacks from common label */
 149 static cmlb_tg_ops_t xdf_lb_ops = {
 150         TG_DK_OPS_VERSION_1,
 151         xdf_lb_rdwr,
 152         xdf_lb_getinfo
 153 };
 154 
 155 /*
 156  * I/O buffer DMA attributes
 157  * Make sure one DMA window contains at most BLKIF_MAX_SEGMENTS_PER_REQUEST segments
 158  */
 159 static ddi_dma_attr_t xb_dma_attr = {
 160         DMA_ATTR_V0,
 161         (uint64_t)0,                    /* lowest address */
 162         (uint64_t)0xffffffffffffffff,   /* highest usable address */
 163         (uint64_t)0xffffff,             /* DMA counter limit max */
 164         (uint64_t)XB_BSIZE,             /* alignment in bytes */
 165         XB_BSIZE - 1,                   /* bitmap of burst sizes */
 166         XB_BSIZE,                       /* min transfer */
 167         (uint64_t)XB_MAX_XFER,          /* maximum transfer */
 168         (uint64_t)PAGEOFFSET,           /* 1 page segment length  */
 169         BLKIF_MAX_SEGMENTS_PER_REQUEST, /* maximum number of segments */
 170         XB_BSIZE,                       /* granularity */
 171         0,                              /* flags (reserved) */
 172 };
 173 
 174 static ddi_device_acc_attr_t xc_acc_attr = {
 175         DDI_DEVICE_ATTR_V0,
 176         DDI_NEVERSWAP_ACC,
 177         DDI_STRICTORDER_ACC
 178 };
 179 
 180 static void
 181 xdf_timeout_handler(void *arg)
 182 {
 183         xdf_t *vdp = arg;
 184 
 185         mutex_enter(&vdp->xdf_dev_lk);
 186         vdp->xdf_timeout_id = 0;
 187         mutex_exit(&vdp->xdf_dev_lk);
 188 
 189         /* new timeout thread could be re-scheduled */
 190         xdf_io_start(vdp);
 191 }
 192 
 193 /*
 194  * callback func invoked when DMA/GTE resources become available
 195  *
 196  * Note: we only register one callback function with the grant table subsystem
 197  * since we only have one 'struct gnttab_free_callback' in xdf_t.
 198  */
 199 static int
 200 xdf_dmacallback(caddr_t arg)
 201 {
 202         xdf_t *vdp = (xdf_t *)arg;
 203         ASSERT(vdp != NULL);
 204 
 205         DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
 206             vdp->xdf_addr));
 207 
 208         ddi_trigger_softintr(vdp->xdf_softintr_id);
 209         return (DDI_DMA_CALLBACK_DONE);
 210 }
 211 
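/*
 * Allocate a grant table entry slot (ge_slot_t) for a single ring request.
 * BLKIF_MAX_SEGMENTS_PER_REQUEST grant references are reserved up front.
 * If grant references or the slot itself aren't available right now, we
 * arrange to be called back later (via the grant table free callback or a
 * timeout) so that I/O can be restarted, and return NULL.
 */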
 212 static ge_slot_t *
 213 gs_get(xdf_t *vdp, int isread)
 214 {
 215         grant_ref_t gh;
 216         ge_slot_t *gs;
 217 
 218         /* try to alloc GTEs needed in this slot, first */
 219         if (gnttab_alloc_grant_references(
 220             BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
 221                 if (vdp->xdf_gnt_callback.next == NULL) {
 222                         SETDMACBON(vdp);
 223                         gnttab_request_free_callback(
 224                             &vdp->xdf_gnt_callback,
 225                             (void (*)(void *))xdf_dmacallback,
 226                             (void *)vdp,
 227                             BLKIF_MAX_SEGMENTS_PER_REQUEST);
 228                 }
 229                 return (NULL);
 230         }
 231 
 232         gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
 233         if (gs == NULL) {
 234                 gnttab_free_grant_references(gh);
 235                 if (vdp->xdf_timeout_id == 0)
 236                         /* restart I/O after one second */
 237                         vdp->xdf_timeout_id =
 238                             timeout(xdf_timeout_handler, vdp, hz);
 239                 return (NULL);
 240         }
 241 
 242         /* init the ge_slot */
 243         gs->gs_oeid = vdp->xdf_peer;
 244         gs->gs_isread = isread;
 245         gs->gs_ghead = gh;
 246         gs->gs_ngrefs = 0;
 247 
 248         return (gs);
 249 }
 250 
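/*
 * Free a ge_slot_t: revoke the backend's access to every page granted
 * through this slot, return any unclaimed grant references, unlink the
 * slot from its owning v_req_t, and release it back to the cache.
 */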
 251 static void
 252 gs_free(ge_slot_t *gs)
 253 {
 254         int             i;
 255 
 256         /* release all grant table entry resources used in this slot */
 257         for (i = 0; i < gs->gs_ngrefs; i++)
 258                 gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
 259         gnttab_free_grant_references(gs->gs_ghead);
 260         list_remove(&gs->gs_vreq->v_gs, gs);
 261         kmem_cache_free(xdf_gs_cache, gs);
 262 }
 263 
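/*
 * Claim one grant reference from the slot's reserved pool and grant the
 * backend domain access to the page at mfn.  The grant is made read-only
 * when the I/O is a write, since the backend only needs to read the data
 * we're sending it.
 */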
 264 static grant_ref_t
 265 gs_grant(ge_slot_t *gs, mfn_t mfn)
 266 {
 267         grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
 268 
 269         ASSERT(gr != -1);
 270         ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
 271         gs->gs_ge[gs->gs_ngrefs++] = gr;
 272         gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
 273 
 274         return (gr);
 275 }
 276 
 277 /*
 278  * Alloc a vreq for this bp
 279  * bp->av_back contains the pointer to the vreq upon return
 280  */
 281 static v_req_t *
 282 vreq_get(xdf_t *vdp, buf_t *bp)
 283 {
 284         v_req_t *vreq = NULL;
 285 
 286         ASSERT(BP_VREQ(bp) == NULL);
 287 
 288         vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
 289         if (vreq == NULL) {
 290                 if (vdp->xdf_timeout_id == 0)
 291                         /* restart I/O after one second */
 292                         vdp->xdf_timeout_id =
 293                             timeout(xdf_timeout_handler, vdp, hz);
 294                 return (NULL);
 295         }
 296         bzero(vreq, sizeof (v_req_t));
 297         list_create(&vreq->v_gs, sizeof (ge_slot_t),
 298             offsetof(ge_slot_t, gs_vreq_link));
 299         vreq->v_buf = bp;
 300         vreq->v_status = VREQ_INIT;
 301         vreq->v_runq = B_FALSE;
 302         BP_VREQ_SET(bp, vreq);
 303         /* init of other fields in vreq is up to the caller */
 304 
 305         list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
 306 
 307         return (vreq);
 308 }
 309 
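/*
 * Free a v_req_t, tearing down whatever vreq_setup() managed to allocate
 * (based on v_status): unbind the DMA handle, copy bounce buffer data back
 * to the caller's buf for unaligned reads, and free the bounce buffer and
 * DMA handles.  The vreq is removed from the active vreq list as well.
 */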
 310 static void
 311 vreq_free(xdf_t *vdp, v_req_t *vreq)
 312 {
 313         buf_t   *bp = vreq->v_buf;
 314 
 315         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 316         ASSERT(BP_VREQ(bp) == vreq);
 317 
 318         list_remove(&vdp->xdf_vreq_act, vreq);
 319 
 320         if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
 321                 goto done;
 322 
 323         switch (vreq->v_status) {
 324         case VREQ_DMAWIN_DONE:
 325         case VREQ_GS_ALLOCED:
 326         case VREQ_DMABUF_BOUND:
 327                 (void) ddi_dma_unbind_handle(vreq->v_dmahdl);
 328                 /*FALLTHRU*/
 329         case VREQ_DMAMEM_ALLOCED:
 330                 if (!ALIGNED_XFER(bp)) {
 331                         ASSERT(vreq->v_abuf != NULL);
 332                         if (!IS_ERROR(bp) && IS_READ(bp))
 333                                 bcopy(vreq->v_abuf, bp->b_un.b_addr,
 334                                     bp->b_bcount);
 335                         ddi_dma_mem_free(&vreq->v_align);
 336                 }
 337                 /*FALLTHRU*/
 338         case VREQ_MEMDMAHDL_ALLOCED:
 339                 if (!ALIGNED_XFER(bp))
 340                         ddi_dma_free_handle(&vreq->v_memdmahdl);
 341                 /*FALLTHRU*/
 342         case VREQ_DMAHDL_ALLOCED:
 343                 ddi_dma_free_handle(&vreq->v_dmahdl);
 344                 break;
 345         default:
 346                 break;
 347         }
 348 done:
 349         ASSERT(!vreq->v_runq);
 350         list_destroy(&vreq->v_gs);
 351         kmem_cache_free(xdf_vreq_cache, vreq);
 352 }
 353 
 354 /*
 355  * Snarf new data if our flush block was re-written
 356  */
 357 static void
 358 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
 359 {
 360         int nblks;
 361         boolean_t mapin;
 362 
 363         if (IS_WRITE_BARRIER(vdp, bp))
 364                 return; /* write was a flush write */
 365 
 366         mapin = B_FALSE;
 367         nblks = bp->b_bcount >> DEV_BSHIFT;
 368         if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
 369                 xdf_fbrewrites++;
 370                 if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
 371                         mapin = B_TRUE;
 372                         bp_mapin(bp);
 373                 }
 374                 bcopy(bp->b_un.b_addr +
 375                     ((xdf_flush_block - blkno) << DEV_BSHIFT),
 376                     vdp->xdf_cache_flush_block, DEV_BSIZE);
 377                 if (mapin)
 378                         bp_mapout(bp);
 379         }
 380 }
 381 
 382 /*
 383  * Initialize the DMA and grant table resources for the buf
 384  */
 385 static int
 386 vreq_setup(xdf_t *vdp, v_req_t *vreq)
 387 {
 388         int rc;
 389         ddi_dma_attr_t dmaattr;
 390         uint_t ndcs, ndws;
 391         ddi_dma_handle_t dh;
 392         ddi_dma_handle_t mdh;
 393         ddi_dma_cookie_t dc;
 394         ddi_acc_handle_t abh;
 395         caddr_t aba;
 396         ge_slot_t *gs;
 397         size_t bufsz;
 398         off_t off;
 399         size_t sz;
 400         buf_t *bp = vreq->v_buf;
 401         int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
 402             DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
 403 
 404         switch (vreq->v_status) {
 405         case VREQ_INIT:
 406                 if (IS_FLUSH_DISKCACHE(bp)) {
 407                         if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
 408                                 DPRINTF(DMA_DBG, ("xdf@%s: "
 409                                     "get ge_slot failed\n", vdp->xdf_addr));
 410                                 return (DDI_FAILURE);
 411                         }
 412                         vreq->v_blkno = 0;
 413                         vreq->v_nslots = 1;
 414                         vreq->v_flush_diskcache = FLUSH_DISKCACHE;
 415                         vreq->v_status = VREQ_GS_ALLOCED;
 416                         gs->gs_vreq = vreq;
 417                         list_insert_head(&vreq->v_gs, gs);
 418                         return (DDI_SUCCESS);
 419                 }
 420 
 421                 if (IS_WRITE_BARRIER(vdp, bp))
 422                         vreq->v_flush_diskcache = WRITE_BARRIER;
 423                 vreq->v_blkno = bp->b_blkno +
 424                     (diskaddr_t)(uintptr_t)bp->b_private;
 425                 /* See if we wrote new data to our flush block */
 426                 if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
 427                         check_fbwrite(vdp, bp, vreq->v_blkno);
 428                 vreq->v_status = VREQ_INIT_DONE;
 429                 /*FALLTHRU*/
 430 
 431         case VREQ_INIT_DONE:
 432                 /*
 433                  * alloc DMA handle
 434                  */
 435                 rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
 436                     xdf_dmacallback, (caddr_t)vdp, &dh);
 437                 if (rc != DDI_SUCCESS) {
 438                         SETDMACBON(vdp);
 439                         DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
 440                             vdp->xdf_addr));
 441                         return (DDI_FAILURE);
 442                 }
 443 
 444                 vreq->v_dmahdl = dh;
 445                 vreq->v_status = VREQ_DMAHDL_ALLOCED;
 446                 /*FALLTHRU*/
 447 
 448         case VREQ_DMAHDL_ALLOCED:
 449                 /*
 450                  * alloc dma handle for 512-byte aligned buf
 451                  */
 452                 if (!ALIGNED_XFER(bp)) {
 453                         /*
 454                          * XXPV: we need to temporarily enlarge the seg
 455                          * boundary and s/g length to work around CR6381968
 456                          */
 457                         dmaattr = xb_dma_attr;
 458                         dmaattr.dma_attr_seg = (uint64_t)-1;
 459                         dmaattr.dma_attr_sgllen = INT_MAX;
 460                         rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
 461                             xdf_dmacallback, (caddr_t)vdp, &mdh);
 462                         if (rc != DDI_SUCCESS) {
 463                                 SETDMACBON(vdp);
 464                                 DPRINTF(DMA_DBG, ("xdf@%s: "
 465                                     "unaligned buf DMAhandle alloc failed\n",
 466                                     vdp->xdf_addr));
 467                                 return (DDI_FAILURE);
 468                         }
 469                         vreq->v_memdmahdl = mdh;
 470                         vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
 471                 }
 472                 /*FALLTHRU*/
 473 
 474         case VREQ_MEMDMAHDL_ALLOCED:
 475                 /*
 476                  * alloc 512-byte aligned buf
 477                  */
 478                 if (!ALIGNED_XFER(bp)) {
 479                         if (bp->b_flags & (B_PAGEIO | B_PHYS))
 480                                 bp_mapin(bp);
 481                         rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
 482                             roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
 483                             DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
 484                             &aba, &bufsz, &abh);
 485                         if (rc != DDI_SUCCESS) {
 486                                 SETDMACBON(vdp);
 487                                 DPRINTF(DMA_DBG, ("xdf@%s: "
 488                                     "DMA mem allocation failed\n",
 489                                     vdp->xdf_addr));
 490                                 return (DDI_FAILURE);
 491                         }
 492 
 493                         vreq->v_abuf = aba;
 494                         vreq->v_align = abh;
 495                         vreq->v_status = VREQ_DMAMEM_ALLOCED;
 496 
 497                         ASSERT(bufsz >= bp->b_bcount);
 498                         if (!IS_READ(bp))
 499                                 bcopy(bp->b_un.b_addr, vreq->v_abuf,
 500                                     bp->b_bcount);
 501                 }
 502                 /*FALLTHRU*/
 503 
 504         case VREQ_DMAMEM_ALLOCED:
 505                 /*
 506                  * dma bind
 507                  */
 508                 if (ALIGNED_XFER(bp)) {
 509                         rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
 510                             dma_flags, xdf_dmacallback, (caddr_t)vdp,
 511                             &dc, &ndcs);
 512                 } else {
 513                         rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
 514                             NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
 515                             xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
 516                 }
 517                 if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
 518                         /* get num of dma windows */
 519                         if (rc == DDI_DMA_PARTIAL_MAP) {
 520                                 rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
 521                                 ASSERT(rc == DDI_SUCCESS);
 522                         } else {
 523                                 ndws = 1;
 524                         }
 525                 } else {
 526                         SETDMACBON(vdp);
 527                         DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
 528                             vdp->xdf_addr));
 529                         return (DDI_FAILURE);
 530                 }
 531 
 532                 vreq->v_dmac = dc;
 533                 vreq->v_dmaw = 0;
 534                 vreq->v_ndmacs = ndcs;
 535                 vreq->v_ndmaws = ndws;
 536                 vreq->v_nslots = ndws;
 537                 vreq->v_status = VREQ_DMABUF_BOUND;
 538                 /*FALLTHRU*/
 539 
 540         case VREQ_DMABUF_BOUND:
 541                 /*
 542                  * get a ge_slot; on failure gs_get() sets up a callback
 543                  * if one isn't already set
 544                  */
 545                 if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
 546                         DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
 547                             vdp->xdf_addr));
 548                         return (DDI_FAILURE);
 549                 }
 550 
 551                 vreq->v_status = VREQ_GS_ALLOCED;
 552                 gs->gs_vreq = vreq;
 553                 list_insert_head(&vreq->v_gs, gs);
 554                 break;
 555 
 556         case VREQ_GS_ALLOCED:
 557                 /* nothing needs to be done */
 558                 break;
 559 
 560         case VREQ_DMAWIN_DONE:
 561                 /*
 562                  * move to the next dma window
 563                  */
 564                 ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
 565 
 566                 /* get a ge_slot for this DMA window */
 567                 if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
 568                         DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
 569                             vdp->xdf_addr));
 570                         return (DDI_FAILURE);
 571                 }
 572 
 573                 vreq->v_dmaw++;
 574                 VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
 575                     &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
 576                 vreq->v_status = VREQ_GS_ALLOCED;
 577                 gs->gs_vreq = vreq;
 578                 list_insert_head(&vreq->v_gs, gs);
 579                 break;
 580 
 581         default:
 582                 return (DDI_FAILURE);
 583         }
 584 
 585         return (DDI_SUCCESS);
 586 }
 587 
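/*
 * Register this device with the common disk label (cmlb) module.  The
 * device type and minor node types depend on whether the backend is a
 * CD-ROM; the removable flag comes from the device configuration.
 */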
 588 static int
 589 xdf_cmlb_attach(xdf_t *vdp)
 590 {
 591         dev_info_t      *dip = vdp->xdf_dip;
 592 
 593         return (cmlb_attach(dip, &xdf_lb_ops,
 594             XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
 595             XD_IS_RM(vdp),
 596             B_TRUE,
 597             XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
 598 #if defined(XPV_HVM_DRIVER)
 599             (XD_IS_CD(vdp) ? 0 : CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT) |
 600             CMLB_INTERNAL_MINOR_NODES,
 601 #else /* !XPV_HVM_DRIVER */
 602             XD_IS_CD(vdp) ? 0 : CMLB_FAKE_LABEL_ONE_PARTITION,
 603 #endif /* !XPV_HVM_DRIVER */
 604             vdp->xdf_vd_lbl, NULL));
 605 }
 606 
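/*
 * Complete a buf with the given error.  A resid of 0 means that none of
 * the request was transferred, so the full byte count is reported back
 * as the residual.
 */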
 607 static void
 608 xdf_io_err(buf_t *bp, int err, size_t resid)
 609 {
 610         bioerror(bp, err);
 611         if (resid == 0)
 612                 bp->b_resid = bp->b_bcount;
 613         biodone(bp);
 614 }
 615 
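/*
 * I/O kstat wait/run queue accounting.  A buf is charged to the wait
 * queue until its vreq has been granted ring buffer resources (v_runq
 * set), at which point it is charged to the run queue instead.
 */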
 616 static void
 617 xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
 618 {
 619         v_req_t *vreq = BP_VREQ(bp);
 620 
 621         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 622 
 623         if (vdp->xdf_xdev_iostat == NULL)
 624                 return;
 625         if ((vreq != NULL) && vreq->v_runq) {
 626                 kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 627         } else {
 628                 kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 629         }
 630 }
 631 
 632 static void
 633 xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
 634 {
 635         v_req_t *vreq = BP_VREQ(bp);
 636 
 637         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 638 
 639         if (vdp->xdf_xdev_iostat == NULL)
 640                 return;
 641         if ((vreq != NULL) && vreq->v_runq) {
 642                 kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 643         } else {
 644                 kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 645         }
 646 }
 647 
 648 static void
 649 xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
 650 {
 651         v_req_t *vreq = BP_VREQ(bp);
 652 
 653         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 654         ASSERT(!vreq->v_runq);
 655 
 656         vreq->v_runq = B_TRUE;
 657         if (vdp->xdf_xdev_iostat == NULL)
 658                 return;
 659         kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 660 }
 661 
 662 static void
 663 xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
 664 {
 665         v_req_t *vreq = BP_VREQ(bp);
 666 
 667         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 668         ASSERT(vreq->v_runq);
 669 
 670         vreq->v_runq = B_FALSE;
 671         if (vdp->xdf_xdev_iostat == NULL)
 672                 return;
 673         kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 674 }
 675 
 676 int
 677 xdf_kstat_create(dev_info_t *dip, char *ks_module, int instance)
 678 {
 679         xdf_t           *vdp = (xdf_t *)ddi_get_driver_private(dip);
 680         kstat_t         *kstat;
 681         buf_t           *bp;
 682 
 683         if ((kstat = kstat_create(
 684             ks_module, instance, NULL, "disk",
 685             KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
 686                 return (-1);
 687 
 688         /* See comment about locking in xdf_kstat_delete(). */
 689         mutex_enter(&vdp->xdf_iostat_lk);
 690         mutex_enter(&vdp->xdf_dev_lk);
 691 
 692         /* only one kstat can exist at a time */
 693         if (vdp->xdf_xdev_iostat != NULL) {
 694                 mutex_exit(&vdp->xdf_dev_lk);
 695                 mutex_exit(&vdp->xdf_iostat_lk);
 696                 kstat_delete(kstat);
 697                 return (-1);
 698         }
 699 
 700         vdp->xdf_xdev_iostat = kstat;
 701         vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
 702         kstat_install(vdp->xdf_xdev_iostat);
 703 
 704         /*
 705          * Now that we've created a kstat, we need to update the waitq and
 706          * runq counts for the kstat to reflect our current state.
 707          *
 708          * For a buf_t structure to be on the runq, it must have a ring
 709          * buffer slot associated with it.  To get a ring buffer slot the
 710          * buf must first have a v_req_t and a ge_slot_t associated with it.
 711          * Then when it is granted a ring buffer slot, v_runq will be set to
 712          * true.
 713          *
 714          * For a buf_t structure to be on the waitq, it must not be on the
 715          * runq.  So to find all the buf_t's that should be on waitq, we
 716          * walk the active buf list and add any buf_t's which aren't on the
 717          * runq to the waitq.
 718          */
 719         bp = vdp->xdf_f_act;
 720         while (bp != NULL) {
 721                 xdf_kstat_enter(vdp, bp);
 722                 bp = bp->av_forw;
 723         }
 724         if (vdp->xdf_ready_tq_bp != NULL)
 725                 xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
 726 
 727         mutex_exit(&vdp->xdf_dev_lk);
 728         mutex_exit(&vdp->xdf_iostat_lk);
 729         return (0);
 730 }
 731 
 732 void
 733 xdf_kstat_delete(dev_info_t *dip)
 734 {
 735         xdf_t           *vdp = (xdf_t *)ddi_get_driver_private(dip);
 736         kstat_t         *kstat;
 737         buf_t           *bp;
 738 
 739         /*
 740          * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
 741          * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
 742  * and the contents of our kstat.  xdf_iostat_lk is used
 743          * to protect the allocation and freeing of the actual kstat.
 744          * xdf_dev_lk can't be used for this purpose because kstat
 745          * readers use it to access the contents of the kstat and
 746          * hence it can't be held when calling kstat_delete().
 747          */
 748         mutex_enter(&vdp->xdf_iostat_lk);
 749         mutex_enter(&vdp->xdf_dev_lk);
 750 
 751         if (vdp->xdf_xdev_iostat == NULL) {
 752                 mutex_exit(&vdp->xdf_dev_lk);
 753                 mutex_exit(&vdp->xdf_iostat_lk);
 754                 return;
 755         }
 756 
 757         /*
 758          * We're about to destroy the kstat structures, so it isn't really
 759          * necessary to update the runq and waitq counts.  But, since this
 760          * isn't a hot code path we can afford to be a little pedantic and
 761          * go ahead and decrement the runq and waitq kstat counters to zero
 762          * before free'ing them.  This helps us ensure that we've gotten all
 763          * our accounting correct.
 764          *
 765          * For an explanation of how we determine which buffers go on the
 766          * runq vs which go on the waitq, see the comments in
 767          * xdf_kstat_create().
 768          */
 769         bp = vdp->xdf_f_act;
 770         while (bp != NULL) {
 771                 xdf_kstat_exit(vdp, bp);
 772                 bp = bp->av_forw;
 773         }
 774         if (vdp->xdf_ready_tq_bp != NULL)
 775                 xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
 776 
 777         kstat = vdp->xdf_xdev_iostat;
 778         vdp->xdf_xdev_iostat = NULL;
 779         mutex_exit(&vdp->xdf_dev_lk);
 780         kstat_delete(kstat);
 781         mutex_exit(&vdp->xdf_iostat_lk);
 782 }
 783 
 784 /*
 785  * Add an IO request onto the active queue.
 786  *
 787  * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
 788  * are used to establish a connection to the backend, so they receive
 789  * priority over all other IOs.  Since xdf_ready_tq_thread only does
 790  * synchronous IO, there can only be one xdf_ready_tq_thread request at any
 791  * given time and we record the buf associated with that request in
 792  * xdf_ready_tq_bp.
 793  */
 794 static void
 795 xdf_bp_push(xdf_t *vdp, buf_t *bp)
 796 {
 797         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 798         ASSERT(bp->av_forw == NULL);
 799 
 800         xdf_kstat_enter(vdp, bp);
 801 
 802         if (curthread == vdp->xdf_ready_tq_thread) {
 803                 /* new IO requests from the ready thread */
 804                 ASSERT(vdp->xdf_ready_tq_bp == NULL);
 805                 vdp->xdf_ready_tq_bp = bp;
 806                 return;
 807         }
 808 
 809         /* this is a normal IO request */
 810         ASSERT(bp != vdp->xdf_ready_tq_bp);
 811 
 812         if (vdp->xdf_f_act == NULL) {
 813                 /* this is the only IO on the active queue */
 814                 ASSERT(vdp->xdf_l_act == NULL);
 815                 ASSERT(vdp->xdf_i_act == NULL);
 816                 vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
 817                 return;
 818         }
 819 
 820         /* add this IO to the tail of the active queue */
 821         vdp->xdf_l_act->av_forw = bp;
 822         vdp->xdf_l_act = bp;
 823         if (vdp->xdf_i_act == NULL)
 824                 vdp->xdf_i_act = bp;
 825 }
 826 
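/*
 * Remove a completed IO from the active queue.  The xdf_ready_tq_thread
 * buf is tracked separately in xdf_ready_tq_bp; normal IOs are unlinked
 * from the av_forw list, and may complete out of order with respect to
 * other pending IOs.
 */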
 827 static void
 828 xdf_bp_pop(xdf_t *vdp, buf_t *bp)
 829 {
 830         buf_t   *bp_iter;
 831 
 832         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 833         ASSERT(VREQ_DONE(BP_VREQ(bp)));
 834 
 835         if (vdp->xdf_ready_tq_bp == bp) {
 836                 /* we're done with a ready thread IO request */
 837                 ASSERT(bp->av_forw == NULL);
 838                 vdp->xdf_ready_tq_bp = NULL;
 839                 return;
 840         }
 841 
 842         /* we're done with a normal IO request */
 843         ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
 844         ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
 845         ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
 846         ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
 847 
 848         if (bp == vdp->xdf_f_act) {
 849                 /* This IO was at the head of our active queue. */
 850                 vdp->xdf_f_act = bp->av_forw;
 851                 if (bp == vdp->xdf_l_act)
 852                         vdp->xdf_l_act = NULL;
 853         } else {
 854                 /* This IO finished before some other pending IOs. */
 855                 bp_iter = vdp->xdf_f_act;
 856                 while (bp != bp_iter->av_forw) {
 857                         bp_iter = bp_iter->av_forw;
 858                         ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
 859                         ASSERT(bp_iter != vdp->xdf_i_act);
 860                 }
 861                 bp_iter->av_forw = bp->av_forw;
 862                 if (bp == vdp->xdf_l_act)
 863                         vdp->xdf_l_act = bp_iter;
 864         }
 865         bp->av_forw = NULL;
 866 }
 867 
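/*
 * Return the next buf we should be processing.  In the XD_CONNECTED
 * state only the xdf_ready_tq_thread buf is eligible; in the XD_READY
 * state we return the first buf on the active queue whose vreq isn't
 * finished, advancing xdf_i_act past completed bufs along the way.
 */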
 868 static buf_t *
 869 xdf_bp_next(xdf_t *vdp)
 870 {
 871         v_req_t *vreq;
 872         buf_t   *bp;
 873 
 874         if (vdp->xdf_state == XD_CONNECTED) {
 875                 /*
 876                  * If we're in the XD_CONNECTED state, we only service IOs
 877                  * from the xdf_ready_tq_thread thread.
 878                  */
 879                 if ((bp = vdp->xdf_ready_tq_bp) == NULL)
 880                         return (NULL);
 881                 if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
 882                         return (bp);
 883                 return (NULL);
 884         }
 885 
 886         /* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
 887         if (vdp->xdf_state != XD_READY)
 888                 return (NULL);
 889 
 890         ASSERT(vdp->xdf_ready_tq_bp == NULL);
 891         for (;;) {
 892                 if ((bp = vdp->xdf_i_act) == NULL)
 893                         return (NULL);
 894                 if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
 895                         return (bp);
 896 
 897                 /* advance the active buf index pointer */
 898                 vdp->xdf_i_act = bp->av_forw;
 899         }
 900 }
 901 
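/*
 * Completion handling for one ring request (identified by its ge_slot_t).
 * Free the slot; once every slot (DMA window) associated with the vreq
 * has completed, remove the buf from the active queue, update the kstats,
 * free the vreq, and biodone() the buf.
 */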
 902 static void
 903 xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
 904 {
 905         ge_slot_t       *gs = (ge_slot_t *)(uintptr_t)id;
 906         v_req_t         *vreq = gs->gs_vreq;
 907         buf_t           *bp = vreq->v_buf;
 908 
 909         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 910         ASSERT(BP_VREQ(bp) == vreq);
 911 
 912         gs_free(gs);
 913 
 914         if (bioerr != 0)
 915                 bioerror(bp, bioerr);
 916         ASSERT(vreq->v_nslots > 0);
 917         if (--vreq->v_nslots > 0)
 918                 return;
 919 
 920         /* remove this IO from our active queue */
 921         xdf_bp_pop(vdp, bp);
 922 
 923         ASSERT(vreq->v_runq);
 924         xdf_kstat_exit(vdp, bp);
 925         vreq->v_runq = B_FALSE;
 926         vreq_free(vdp, vreq);
 927 
 928         if (IS_ERROR(bp)) {
 929                 xdf_io_err(bp, geterror(bp), 0);
 930         } else if (bp->b_resid != 0) {
 931                 /* Partial transfers are an error */
 932                 xdf_io_err(bp, EIO, bp->b_resid);
 933         } else {
 934                 biodone(bp);
 935         }
 936 }
 937 
 938 /*
 939  * xdf interrupt handler
 940  */
 941 static uint_t
 942 xdf_intr_locked(xdf_t *vdp)
 943 {
 944         xendev_ring_t *xbr;
 945         blkif_response_t *resp;
 946         int bioerr;
 947         uint64_t id;
 948         uint8_t op;
 949         uint16_t status;
 950         ddi_acc_handle_t acchdl;
 951 
 952         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 953 
 954         if ((xbr = vdp->xdf_xb_ring) == NULL)
 955                 return (DDI_INTR_UNCLAIMED);
 956 
 957         acchdl = vdp->xdf_xb_ring_hdl;
 958 
 959         /*
 960          * complete all requests which have a response
 961          */
 962         while ((resp = xvdi_ring_get_response(xbr)) != NULL) {
 963                 id = ddi_get64(acchdl, &resp->id);
 964                 op = ddi_get8(acchdl, &resp->operation);
 965                 status = ddi_get16(acchdl, (uint16_t *)&resp->status);
 966                 DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
 967                     op, id, status));
 968 
 969                 if (status != BLKIF_RSP_OKAY) {
 970                         DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
 971                             vdp->xdf_addr,
 972                             (op == BLKIF_OP_READ) ? "reading" : "writing"));
 973                         bioerr = EIO;
 974                 } else {
 975                         bioerr = 0;
 976                 }
 977 
 978                 xdf_io_fini(vdp, id, bioerr);
 979         }
 980         return (DDI_INTR_CLAIMED);
 981 }
 982 
 983 /*
 984  * xdf_intr runs at PIL 5, so no one else can grab xdf_dev_lk and
 985  * block at a lower pil.
 986  */
 987 static uint_t
 988 xdf_intr(caddr_t arg)
 989 {
 990         xdf_t *vdp = (xdf_t *)arg;
 991         int rv;
 992 
 993         mutex_enter(&vdp->xdf_dev_lk);
 994         rv = xdf_intr_locked(vdp);
 995         mutex_exit(&vdp->xdf_dev_lk);
 996 
 997         if (!do_polled_io)
 998                 xdf_io_start(vdp);
 999 
1000         return (rv);
1001 }
1002 
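/*
 * Push any newly produced requests out to the I/O ring and, if our event
 * channel is valid, notify the backend that work is pending.
 */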
1003 static void
1004 xdf_ring_push(xdf_t *vdp)
1005 {
1006         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1007 
1008         if (vdp->xdf_xb_ring == NULL)
1009                 return;
1010 
1011         if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
1012                 DPRINTF(IO_DBG, (
1013                     "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
1014                     vdp->xdf_addr));
1015         }
1016 
1017         if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
1018                 xvdi_notify_oe(vdp->xdf_dip);
1019 }
1020 
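/*
 * Poll the ring until the backend has completed all outstanding requests,
 * consuming responses ourselves via xdf_intr_locked(), or until we give
 * up after XDF_DRAIN_RETRY_COUNT attempts.  Returns EIO if requests or
 * responses are still outstanding.
 */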
1021 static int
1022 xdf_ring_drain_locked(xdf_t *vdp)
1023 {
1024         int             pollc, rv = 0;
1025 
1026         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1027 
1028         if (xdf_debug & SUSRES_DBG)
1029                 xen_printf("xdf_ring_drain: start\n");
1030 
1031         for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
1032                 if (vdp->xdf_xb_ring == NULL)
1033                         goto out;
1034 
1035                 if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1036                         (void) xdf_intr_locked(vdp);
1037                 if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
1038                         goto out;
1039                 xdf_ring_push(vdp);
1040 
1041                 /* file-backed devices can be slow */
1042                 mutex_exit(&vdp->xdf_dev_lk);
1043 #ifdef XPV_HVM_DRIVER
1044                 (void) HYPERVISOR_yield();
1045 #endif /* XPV_HVM_DRIVER */
1046                 delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
1047                 mutex_enter(&vdp->xdf_dev_lk);
1048         }
1049         cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
1050 
1051 out:
1052         if (vdp->xdf_xb_ring != NULL) {
1053                 if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
1054                     xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1055                         rv = EIO;
1056         }
1057         if (xdf_debug & SUSRES_DBG)
1058                 xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
1059                     vdp->xdf_addr, rv);
1060         return (rv);
1061 }
1062 
1063 static int
1064 xdf_ring_drain(xdf_t *vdp)
1065 {
1066         int rv;
1067         mutex_enter(&vdp->xdf_dev_lk);
1068         rv = xdf_ring_drain_locked(vdp);
1069         mutex_exit(&vdp->xdf_dev_lk);
1070         return (rv);
1071 }
1072 
1073 /*
1074  * Destroy all v_req_t, grant table entries, and our ring buffer.
1075  */
1076 static void
1077 xdf_ring_destroy(xdf_t *vdp)
1078 {
1079         v_req_t         *vreq;
1080         buf_t           *bp;
1081         ge_slot_t       *gs;
1082 
1083         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1084         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1085 
1086         if ((vdp->xdf_state != XD_INIT) &&
1087             (vdp->xdf_state != XD_CONNECTED) &&
1088             (vdp->xdf_state != XD_READY)) {
1089                 ASSERT(vdp->xdf_xb_ring == NULL);
1090                 ASSERT(vdp->xdf_xb_ring_hdl == NULL);
1091                 ASSERT(vdp->xdf_peer == INVALID_DOMID);
1092                 ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
1093                 ASSERT(list_is_empty(&vdp->xdf_vreq_act));
1094                 return;
1095         }
1096 
1097         /*
1098          * We don't want to receive async notifications from the backend
1099          * when it finishes processing ring entries.
1100          */
1101 #ifdef XPV_HVM_DRIVER
1102         ec_unbind_evtchn(vdp->xdf_evtchn);
1103 #else /* !XPV_HVM_DRIVER */
1104         (void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1105 #endif /* !XPV_HVM_DRIVER */
1106 
1107         /*
1108          * Drain any requests in the ring.  We need to do this before we
1109          * can free grant table entries, because if active ring entries
1110          * point to grants, then the backend could be trying to access
1111          * those grants.
1112          */
1113         (void) xdf_ring_drain_locked(vdp);
1114 
1115         /* We're done talking to the backend so free up our event channel */
1116         xvdi_free_evtchn(vdp->xdf_dip);
1117         vdp->xdf_evtchn = INVALID_EVTCHN;
1118 
1119         while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
1120                 bp = vreq->v_buf;
1121                 ASSERT(BP_VREQ(bp) == vreq);
1122 
1123                 /* Free up any grant table entries associated with this IO */
1124                 while ((gs = list_head(&vreq->v_gs)) != NULL)
1125                         gs_free(gs);
1126 
1127                 /* If this IO was on the runq, move it back to the waitq. */
1128                 if (vreq->v_runq)
1129                         xdf_kstat_runq_to_waitq(vdp, bp);
1130 
1131                 /*
1132                  * Reset any buf IO state since we're going to re-issue the
1133                  * IO when we reconnect.
1134                  */
1135                 vreq_free(vdp, vreq);
1136                 BP_VREQ_SET(bp, NULL);
1137                 bioerror(bp, 0);
1138         }
1139 
1140         /* reset the active queue index pointer */
1141         vdp->xdf_i_act = vdp->xdf_f_act;
1142 
1143         /* Destroy the ring */
1144         xvdi_free_ring(vdp->xdf_xb_ring);
1145         vdp->xdf_xb_ring = NULL;
1146         vdp->xdf_xb_ring_hdl = NULL;
1147         vdp->xdf_peer = INVALID_DOMID;
1148 }
1149 
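/*
 * minphys routine: clamp a buf's transfer size to xdf_maxphys.
 */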
1150 void
1151 xdfmin(struct buf *bp)
1152 {
1153         if (bp->b_bcount > xdf_maxphys)
1154                 bp->b_bcount = xdf_maxphys;
1155 }
1156 
1157 /*
1158  * Check if we have a pending "eject" media request.
1159  */
1160 static int
1161 xdf_eject_pending(xdf_t *vdp)
1162 {
1163         dev_info_t      *dip = vdp->xdf_dip;
1164         char            *xsname, *str;
1165 
1166         if (!vdp->xdf_media_req_supported)
1167                 return (B_FALSE);
1168 
1169         if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1170             (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
1171                 return (B_FALSE);
1172 
1173         if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
1174                 strfree(str);
1175                 return (B_FALSE);
1176         }
1177         strfree(str);
1178         return (B_TRUE);
1179 }
1180 
1181 /*
1182  * Generate a media request.
1183  */
1184 static int
1185 xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
1186 {
1187         dev_info_t      *dip = vdp->xdf_dip;
1188         char            *xsname;
1189 
1190         /*
1191          * we can't be holding xdf_dev_lk because xenbus_printf() can
1192          * block while waiting for a PIL 1 interrupt message.  this
1193          * would cause a deadlock with xdf_intr() which needs to grab
1194          * xdf_dev_lk as well and runs at PIL 5.
1195          */
1196         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1197         ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1198 
1199         if ((xsname = xvdi_get_xsname(dip)) == NULL)
1200                 return (ENXIO);
1201 
1202         /* Check if we support media requests */
1203         if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
1204                 return (ENOTTY);
1205 
1206         /* If an eject is pending then don't allow any new requests */
1207         if (xdf_eject_pending(vdp))
1208                 return (ENXIO);
1209 
1210         /* Make sure that there is media present */
1211         if (media_required && (vdp->xdf_xdev_nblocks == 0))
1212                 return (ENXIO);
1213 
1214         /* We only allow operations when the device is ready and connected */
1215         if (vdp->xdf_state != XD_READY)
1216                 return (EIO);
1217 
1218         if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
1219                 return (EIO);
1220 
1221         return (0);
1222 }
1223 
1224 /*
1225  * populate a single blkif_request_t w/ a buf
1226  */
1227 static void
1228 xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1229 {
1230         grant_ref_t     gr;
1231         uint8_t         fsect, lsect;
1232         size_t          bcnt;
1233         paddr_t         dma_addr;
1234         off_t           blk_off;
1235         dev_info_t      *dip = vdp->xdf_dip;
1236         blkif_vdev_t    vdev = xvdi_get_vdevnum(dip);
1237         v_req_t         *vreq = BP_VREQ(bp);
1238         uint64_t        blkno = vreq->v_blkno;
1239         uint_t          ndmacs = vreq->v_ndmacs;
1240         ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1241         int             seg = 0;
1242         int             isread = IS_READ(bp);
1243         ge_slot_t       *gs = list_head(&vreq->v_gs);
1244 
1245         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1246         ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
1247 
1248         if (isread)
1249                 ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1250         else {
1251                 switch (vreq->v_flush_diskcache) {
1252                 case FLUSH_DISKCACHE:
1253                         ddi_put8(acchdl, &rreq->operation,
1254                             BLKIF_OP_FLUSH_DISKCACHE);
1255                         ddi_put16(acchdl, &rreq->handle, vdev);
1256                         ddi_put64(acchdl, &rreq->id,
1257                             (uint64_t)(uintptr_t)(gs));
1258                         ddi_put8(acchdl, &rreq->nr_segments, 0);
1259                         vreq->v_status = VREQ_DMAWIN_DONE;
1260                         return;
1261                 case WRITE_BARRIER:
1262                         ddi_put8(acchdl, &rreq->operation,
1263                             BLKIF_OP_WRITE_BARRIER);
1264                         break;
1265                 default:
1266                         if (!vdp->xdf_wce)
1267                                 ddi_put8(acchdl, &rreq->operation,
1268                                     BLKIF_OP_WRITE_BARRIER);
1269                         else
1270                                 ddi_put8(acchdl, &rreq->operation,
1271                                     BLKIF_OP_WRITE);
1272                         break;
1273                 }
1274         }
1275 
1276         ddi_put16(acchdl, &rreq->handle, vdev);
1277         ddi_put64(acchdl, &rreq->sector_number, blkno);
1278         ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
1279 
1280         /*
1281          * loop until all segments are populated or there are no more dma cookies in the buf
1282          */
1283         for (;;) {
1284                 /*
1285                  * Each segment of a blkif request can transfer up to
1286                  * one 4K page of data.
1287                  */
1288                 bcnt = vreq->v_dmac.dmac_size;
1289                 dma_addr = vreq->v_dmac.dmac_laddress;
1290                 blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1291                 fsect = blk_off >> XB_BSHIFT;
1292                 lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1293 
1294                 ASSERT(bcnt <= PAGESIZE);
1295                 ASSERT((bcnt % XB_BSIZE) == 0);
1296                 ASSERT((blk_off & XB_BMASK) == 0);
1297                 ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1298                     lsect < XB_MAX_SEGLEN / XB_BSIZE);
1299 
1300                 gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
1301                 ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1302                 ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1303                 ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1304 
1305                 DPRINTF(IO_DBG, (
1306                     "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
1307                     vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
1308                 DPRINTF(IO_DBG, (
1309                     "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
1310                     vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
1311 
1312                 blkno += (bcnt >> XB_BSHIFT);
1313                 seg++;
1314                 ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1315                 if (--ndmacs) {
1316                         ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1317                         continue;
1318                 }
1319 
1320                 vreq->v_status = VREQ_DMAWIN_DONE;
1321                 vreq->v_blkno = blkno;
1322                 break;
1323         }
1324         ddi_put8(acchdl,  &rreq->nr_segments, seg);
1325         DPRINTF(IO_DBG, (
1326             "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
1327             vdp->xdf_addr, rreq->id));
1328 }
1329 
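/*
 * Start as many pending IOs as possible: for each runnable buf, allocate
 * vreq, DMA, and grant table resources, populate a ring request, and
 * finally push the ring and notify the backend.
 */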
1330 static void
1331 xdf_io_start(xdf_t *vdp)
1332 {
1333         struct buf      *bp;
1334         v_req_t         *vreq;
1335         blkif_request_t *rreq;
1336         boolean_t       rreqready = B_FALSE;
1337 
1338         mutex_enter(&vdp->xdf_dev_lk);
1339 
1340         /*
1341          * Populate the ring request(s).  Loop until there is no buf to
1342          * transfer or no free slot available in I/O ring.
1343          */
1344         for (;;) {
1345                 /* don't start any new IO if we're suspending */
1346                 if (vdp->xdf_suspending)
1347                         break;
1348                 if ((bp = xdf_bp_next(vdp)) == NULL)
1349                         break;
1350 
1351                 /* if the buf doesn't already have a vreq, allocate one */
1352                 if (((vreq = BP_VREQ(bp)) == NULL) &&
1353                     ((vreq = vreq_get(vdp, bp)) == NULL))
1354                         break;
1355 
1356                 /* alloc DMA/GTE resources */
1357                 if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
1358                         break;
1359 
1360                 /* get next blkif_request in the ring */
1361                 if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
1362                         break;
1363                 bzero(rreq, sizeof (blkif_request_t));
1364                 rreqready = B_TRUE;
1365 
1366                 /* populate blkif_request with this buf */
1367                 xdf_process_rreq(vdp, bp, rreq);
1368 
1369                 /*
1370                  * This buffer/vreq pair has been allocated ring buffer
1371                  * resources, so if it isn't already in our runq, add it.
1372                  */
1373                 if (!vreq->v_runq)
1374                         xdf_kstat_waitq_to_runq(vdp, bp);
1375         }
1376 
1377         /* Send the request(s) to the backend */
1378         if (rreqready)
1379                 xdf_ring_push(vdp);
1380 
1381         mutex_exit(&vdp->xdf_dev_lk);
1382 }
1383 
1384 
1385 /* check if a partition is open; -1 means check all partitions on the disk */
1386 static boolean_t
1387 xdf_isopen(xdf_t *vdp, int partition)
1388 {
1389         int i;
1390         ulong_t parbit;
1391         boolean_t rval = B_FALSE;
1392 
1393         ASSERT((partition == -1) ||
1394             ((partition >= 0) && (partition < XDF_PEXT)));
1395 
1396         if (partition == -1)
1397                 parbit = (ulong_t)-1;
1398         else
1399                 parbit = 1 << partition;
1400 
1401         for (i = 0; i < OTYPCNT; i++) {
1402                 if (vdp->xdf_vd_open[i] & parbit)
1403                         rval = B_TRUE;
1404         }
1405 
1406         return (rval);
1407 }
1408 
1409 /*
1410  * The connection should never be closed as long as someone is holding
1411  * us open, there is pending IO, or someone is waiting for a
1412  * connection.
1413  */
1414 static boolean_t
1415 xdf_busy(xdf_t *vdp)
1416 {
1417         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1418 
1419         if ((vdp->xdf_xb_ring != NULL) &&
1420             xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
1421                 ASSERT(vdp->xdf_state != XD_CLOSED);
1422                 return (B_TRUE);
1423         }
1424 
1425         if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
1426                 ASSERT(vdp->xdf_state != XD_CLOSED);
1427                 return (B_TRUE);
1428         }
1429 
1430         if (xdf_isopen(vdp, -1)) {
1431                 ASSERT(vdp->xdf_state != XD_CLOSED);
1432                 return (B_TRUE);
1433         }
1434 
1435         if (vdp->xdf_connect_req > 0) {
1436                 ASSERT(vdp->xdf_state != XD_CLOSED);
1437                 return (B_TRUE);
1438         }
1439 
1440         return (B_FALSE);
1441 }
1442 
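/*
 * Record a new connection state and wake up any threads waiting for a
 * state change.
 */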
1443 static void
1444 xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
1445 {
1446         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1447         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1448         DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
1449             vdp->xdf_addr, vdp->xdf_state, new_state));
1450         vdp->xdf_state = new_state;
1451         cv_broadcast(&vdp->xdf_dev_cv);
1452 }
1453 
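/*
 * Disconnect from the backend and move to new_state (XD_UNKNOWN or
 * XD_CLOSED).  This tears down the ring and any outstanding grants; if
 * the device is still busy we can only enter the XD_UNKNOWN state, and
 * if we end up closed we tell the backend via a xenbus state switch.
 */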
1454 static void
1455 xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
1456 {
1457         dev_info_t      *dip = vdp->xdf_dip;
1458         boolean_t       busy;
1459 
1460         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1461         ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1462         ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
1463 
1464         /* Check if we're already there. */
1465         if (vdp->xdf_state == new_state)
1466                 return;
1467 
1468         mutex_enter(&vdp->xdf_dev_lk);
1469         busy = xdf_busy(vdp);
1470 
1471         /* If we're already closed then there's nothing to do. */
1472         if (vdp->xdf_state == XD_CLOSED) {
1473                 ASSERT(!busy);
1474                 xdf_set_state(vdp, new_state);
1475                 mutex_exit(&vdp->xdf_dev_lk);
1476                 return;
1477         }
1478 
1479 #ifdef DEBUG
1480         /* UhOh.  Warn the user that something bad has happened. */
1481         if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
1482             (vdp->xdf_xdev_nblocks != 0)) {
1483                 cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
1484                     vdp->xdf_addr);
1485         }
1486 #endif /* DEBUG */
1487 
1488         xdf_ring_destroy(vdp);
1489 
1490         /* If we're busy then we can only go into the unknown state */
1491         xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
1492         mutex_exit(&vdp->xdf_dev_lk);
1493 
1494         /* if we're closed now, let the other end know */
1495         if (vdp->xdf_state == XD_CLOSED)
1496                 (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1497 }
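
     /*
      * A rough map of the state transitions driven by the routines below
      * (sketch only; the setstate functions are authoritative):
      *
      *	XD_UNKNOWN/XD_CLOSED --xdf_setstate_init()---------> XD_INIT
      *	XD_INIT --------------xdf_setstate_connected()-----> XD_CONNECTED
      *	XD_CONNECTED ---------xdf_setstate_ready() (taskq)-> XD_READY
      *	(any state) ----------xdf_disconnect()-------------> XD_UNKNOWN or
      *							      XD_CLOSED
      */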
1498 
1499 
1500 /*
1501  * Kick-off connect process
1502  * Status should be XD_UNKNOWN or XD_CLOSED
1503  * On success, status will be changed to XD_INIT
1504  * On error, it will be changed to XD_UNKNOWN
1505  */
1506 static int
1507 xdf_setstate_init(xdf_t *vdp)
1508 {
1509         dev_info_t              *dip = vdp->xdf_dip;
1510         xenbus_transaction_t    xbt;
1511         grant_ref_t             gref;
1512         char                    *xsname, *str;
1513         int                     rv;
1514 
1515         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1516         ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1517         ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
1518             (vdp->xdf_state == XD_CLOSED));
1519 
1520         DPRINTF(DDI_DBG,
1521             ("xdf@%s: starting connection process\n", vdp->xdf_addr));
1522 
1523         /*
1524          * If an eject is pending then don't allow a new connection.
1525          * (Only the backend can clear a media-req eject request.)
1526          */
1527         if (xdf_eject_pending(vdp))
1528                 return (DDI_FAILURE);
1529 
1530         if ((xsname = xvdi_get_xsname(dip)) == NULL)
1531                 goto errout;
1532 
1533         if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
1534                 goto errout;
1535 
1536         (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
1537 
1538         /*
1539          * Sanity check for the existence of the xenbus device-type property.
1540          * This property might not exist if our xenbus device node was
1541          * forcibly destroyed while we were still connected to the backend.
1542          */
1543         if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
1544                 goto errout;
1545         strfree(str);
1546 
1547         if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
1548                 goto errout;
1549 
1550         vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1551 #ifdef XPV_HVM_DRIVER
1552         ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1553 #else /* !XPV_HVM_DRIVER */
1554         if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1555             DDI_SUCCESS) {
1556                 cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
1557                     "failed to add intr handler", vdp->xdf_addr);
1558                 goto errout1;
1559         }
1560 #endif /* !XPV_HVM_DRIVER */
1561 
1562         if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1563             sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1564             DDI_SUCCESS) {
1565                 cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1566                     vdp->xdf_addr);
1567                 goto errout2;
1568         }
1569         vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1570 
1571         /*
1572          * Write into xenstore the info needed by backend
1573          */
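             /*
              * As a rough sketch of what ends up under our xenstore node once
              * the transaction below commits (the XBP_* macros used here are
              * authoritative for the key names; values are examples):
              *
              *	media-req     = "none"
              *	ring-ref      = <grant reference of the shared ring>
              *	event-channel = <event channel port allocated above>
              *	protocol      = <native ABI string>
              */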
1574 trans_retry:
1575         if (xenbus_transaction_start(&xbt)) {
1576                 cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1577                     vdp->xdf_addr);
1578                 xvdi_fatal_error(dip, EIO, "connect transaction init");
1579                 goto fail_trans;
1580         }
1581 
1582         /*
1583          * XBP_PROTOCOL is written by the domain builder in the case of PV
1584          * domains. However, it is not written for HVM domains, so let's
1585          * write it here.
1586          */
1587         if (((rv = xenbus_printf(xbt, xsname,
1588             XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
1589             ((rv = xenbus_printf(xbt, xsname,
1590             XBP_RING_REF, "%u", gref)) != 0) ||
1591             ((rv = xenbus_printf(xbt, xsname,
1592             XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
1593             ((rv = xenbus_printf(xbt, xsname,
1594             XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
1595             ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
1596                 (void) xenbus_transaction_end(xbt, 1);
1597                 xvdi_fatal_error(dip, rv, "connect transaction setup");
1598                 goto fail_trans;
1599         }
1600 
1601         /* kick-off connect process */
1602         if (rv = xenbus_transaction_end(xbt, 0)) {
1603                 if (rv == EAGAIN)
1604                         goto trans_retry;
1605                 xvdi_fatal_error(dip, rv, "connect transaction commit");
1606                 goto fail_trans;
1607         }
1608 
1609         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1610         mutex_enter(&vdp->xdf_dev_lk);
1611         xdf_set_state(vdp, XD_INIT);
1612         mutex_exit(&vdp->xdf_dev_lk);
1613 
1614         return (DDI_SUCCESS);
1615 
1616 fail_trans:
1617         xvdi_free_ring(vdp->xdf_xb_ring);
1618 errout2:
1619 #ifdef XPV_HVM_DRIVER
1620         ec_unbind_evtchn(vdp->xdf_evtchn);
1621 #else /* !XPV_HVM_DRIVER */
1622         (void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1623 #endif /* !XPV_HVM_DRIVER */
1624 errout1:
1625         xvdi_free_evtchn(dip);
1626         vdp->xdf_evtchn = INVALID_EVTCHN;
1627 errout:
1628         xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1629         cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
1630             vdp->xdf_addr);
1631         return (DDI_FAILURE);
1632 }
1633 
1634 int
1635 xdf_get_flush_block(xdf_t *vdp)
1636 {
1637         /*
1638          * Get a sector-size (xdf_xdev_secsize) aligned buffer
1639          */
1640         vdp->xdf_flush_mem = kmem_alloc(vdp->xdf_xdev_secsize * 2, KM_SLEEP);
1641         vdp->xdf_cache_flush_block =
1642             (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem),
1643             (int)vdp->xdf_xdev_secsize);
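             /*
              * Alignment sketch: allocating 2 * secsize bytes guarantees that
              * P2ROUNDUP() finds a secsize-aligned region of at least secsize
              * bytes within the allocation.  E.g. with a 512 byte sector size,
              * an allocation starting at 0x1204 is rounded up to 0x1400, which
              * still leaves 516 bytes before the end of the 1024 byte buffer.
              */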
1644 
1645         if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1646             xdf_flush_block, vdp->xdf_xdev_secsize, NULL) != 0)
1647                 return (DDI_FAILURE);
1648         return (DDI_SUCCESS);
1649 }
1650 
1651 static void
1652 xdf_setstate_ready(void *arg)
1653 {
1654         xdf_t   *vdp = (xdf_t *)arg;
1655 
1656         vdp->xdf_ready_tq_thread = curthread;
1657 
1658         /*
1659          * We've created all the minor nodes via cmlb_attach() using default
1660          * values in xdf_attach() so that it's possible to block in xdf_open()
1661          * if anyone (say, the boot thread) tries to open the device before
1662          * we're connected to the backend.  Now that we're almost connected,
1663          * refresh those minor nodes with the latest info we have.
1664          */
1665         mutex_enter(&vdp->xdf_dev_lk);
1666         if (vdp->xdf_cmbl_reattach) {
1667                 vdp->xdf_cmbl_reattach = B_FALSE;
1668 
1669                 mutex_exit(&vdp->xdf_dev_lk);
1670                 if (xdf_cmlb_attach(vdp) != 0) {
1671                         xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1672                         return;
1673                 }
1674                 mutex_enter(&vdp->xdf_dev_lk);
1675         }
1676 
1677         /* If we're not still trying to get to the ready state, then bail. */
1678         if (vdp->xdf_state != XD_CONNECTED) {
1679                 mutex_exit(&vdp->xdf_dev_lk);
1680                 return;
1681         }
1682         mutex_exit(&vdp->xdf_dev_lk);
1683 
1684         /*
1685          * If backend has feature-barrier, see if it supports disk
1686          * cache flush op.
1687          */
1688         vdp->xdf_flush_supported = B_FALSE;
1689         if (vdp->xdf_feature_barrier) {
1690                 /*
1691                  * Pretend we already know flush is supported so probe
1692                  * will attempt the correct op.
1693                  */
1694                 vdp->xdf_flush_supported = B_TRUE;
1695                 if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1696                         vdp->xdf_flush_supported = B_TRUE;
1697                 } else {
1698                         vdp->xdf_flush_supported = B_FALSE;
1699                         /*
1700                          * If the other end does not support the cache flush op
1701                          * then we must use a barrier-write to force disk
1702                          * cache flushing.  Barrier writes require that a data
1703                          * block actually be written.
1704                          * Cache a block to barrier-write when we are
1705                          * asked to perform a flush.
1706                          * XXX - would it be better to just copy 1 block
1707                          * (512 bytes) from whatever write we did last
1708                          * and rewrite that block?
1709                          */
1710                         if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
1711                                 xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1712                                 return;
1713                         }
1714                 }
1715         }
1716 
1717         mutex_enter(&vdp->xdf_cb_lk);
1718         mutex_enter(&vdp->xdf_dev_lk);
1719         if (vdp->xdf_state == XD_CONNECTED)
1720                 xdf_set_state(vdp, XD_READY);
1721         mutex_exit(&vdp->xdf_dev_lk);
1722 
1723         /* Restart any currently queued up io */
1724         xdf_io_start(vdp);
1725 
1726         mutex_exit(&vdp->xdf_cb_lk);
1727 }
1728 
1729 /*
1730  * synthetic geometry
1731  */
1732 #define XDF_NSECTS      256
1733 #define XDF_NHEADS      16
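
     /*
      * Worked example using the values above: one synthetic cylinder covers
      * 16 * 256 = 4096 blocks, so a backend exporting, say, 41943040 512-byte
      * blocks (20 GB) gets a synthetic geometry of 41943040 / 4096 = 10240
      * cylinders.
      */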
1734 
1735 static void
1736 xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
1737 {
1738         xdf_t *vdp;
1739         uint_t ncyl;
1740 
1741         vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
1742 
1743         ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1744 
1745         bzero(geomp, sizeof (*geomp));
1746         geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1747         geomp->g_acyl = 0;
1748         geomp->g_nhead = XDF_NHEADS;
1749         geomp->g_nsect = XDF_NSECTS;
1750         geomp->g_secsize = vdp->xdf_xdev_secsize;
1751         geomp->g_capacity = vdp->xdf_xdev_nblocks;
1752         geomp->g_intrlv = 0;
1753         geomp->g_rpm = 7200;
1754 }
1755 
1756 /*
1757  * Finish other initialization after we've connected to backend
1758  * Status should be XD_INIT before calling this routine
1759  * On success, status should be changed to XD_CONNECTED.
1760  * On error, status should stay XD_INIT
1761  */
1762 static int
1763 xdf_setstate_connected(xdf_t *vdp)
1764 {
1765         dev_info_t      *dip = vdp->xdf_dip;
1766         cmlb_geom_t     pgeom;
1767         diskaddr_t      nblocks = 0;
1768         uint_t          secsize = 0;
1769         char            *oename, *xsname, *str;
1770         uint_t          dinfo;
1771 
1772         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1773         ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1774         ASSERT(vdp->xdf_state == XD_INIT);
1775 
1776         if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1777             ((oename = xvdi_get_oename(dip)) == NULL))
1778                 return (DDI_FAILURE);
1779 
1780         /* Make sure the other end is XenbusStateConnected */
1781         if (xenbus_read_driver_state(oename) != XenbusStateConnected)
1782                 return (DDI_FAILURE);
1783 
1784         /* Determine if feature barrier is supported by backend */
1785         if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
1786                 cmn_err(CE_NOTE, "!xdf@%s: feature-barrier not supported",
1787                     vdp->xdf_addr);
1788 
1789         /*
1790          * Probe backend.  Read the device size into xdf_xdev_nblocks
1791          * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
1792          * flags in xdf_dinfo.  If the emulated device type is "cdrom",
1793          * we always set VDISK_CDROM, regardless of whether it's present in
1794          * the xenbus info parameter.
1795          */
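             /*
              * For example (illustrative values only; the XBP_* macros below
              * name the actual keys), a backend might publish:
              *
              *	sectors     = 41943040
              *	sector-size = 512
              *	info        = 0		(VDISK_* flag bits)
              */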
1796         if (xenbus_gather(XBT_NULL, oename,
1797             XBP_SECTORS, "%"SCNu64, &nblocks,
1798             XBP_SECTOR_SIZE, "%u", &secsize,
1799             XBP_INFO, "%u", &dinfo,
1800             NULL) != 0) {
1801                 cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1802                     "cannot read backend info", vdp->xdf_addr);
1803                 return (DDI_FAILURE);
1804         }
1805         if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
1806                 cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
1807                     vdp->xdf_addr);
1808                 return (DDI_FAILURE);
1809         }
1810         if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
1811                 dinfo |= VDISK_CDROM;
1812         strfree(str);
1813 
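             /*
              * The check below is meant to accept only sector sizes that are a
              * power-of-two multiple of DEV_BSIZE (512, 1024, 2048, 4096, ...);
              * a missing or unexpected value falls back to DEV_BSIZE.
              */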
1814         if (secsize == 0 || !(ISP2(secsize / DEV_BSIZE)))
1815                 secsize = DEV_BSIZE;
1816         vdp->xdf_xdev_nblocks = nblocks;
1817         vdp->xdf_xdev_secsize = secsize;
1818 #ifdef _ILP32
1819         if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1820                 cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1821                     "backend disk device too large with %llu blocks for"
1822                     " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
1823                 xvdi_fatal_error(dip, EFBIG, "reading backend info");
1824                 return (DDI_FAILURE);
1825         }
1826 #endif
1827 
1828         /*
1829          * If the physical geometry for a fixed disk has been explicitly
1830          * set then make sure that the specified physical geometry isn't
1831          * larger than the device we connected to.
1832          */
1833         if (vdp->xdf_pgeom_fixed &&
1834             (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
1835                 cmn_err(CE_WARN,
1836                     "xdf@%s: connect failed, fixed geometry too large",
1837                     vdp->xdf_addr);
1838                 return (DDI_FAILURE);
1839         }
1840 
1841         vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
1842 
1843         /* mark vbd is ready for I/O */
1844         mutex_enter(&vdp->xdf_dev_lk);
1845         xdf_set_state(vdp, XD_CONNECTED);
1846 
1847         /* check if the cmlb label should be updated */
1848         xdf_synthetic_pgeom(dip, &pgeom);
1849         if ((vdp->xdf_dinfo != dinfo) ||
1850             (!vdp->xdf_pgeom_fixed &&
1851             (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
1852                 vdp->xdf_cmbl_reattach = B_TRUE;
1853 
1854                 vdp->xdf_dinfo = dinfo;
1855                 if (!vdp->xdf_pgeom_fixed)
1856                         vdp->xdf_pgeom = pgeom;
1857         }
1858 
1859         if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
1860                 if (vdp->xdf_xdev_nblocks == 0) {
1861                         vdp->xdf_mstate = DKIO_EJECTED;
1862                         cv_broadcast(&vdp->xdf_mstate_cv);
1863                 } else {
1864                         vdp->xdf_mstate = DKIO_INSERTED;
1865                         cv_broadcast(&vdp->xdf_mstate_cv);
1866                 }
1867         } else {
1868                 if (vdp->xdf_mstate != DKIO_NONE) {
1869                         vdp->xdf_mstate = DKIO_NONE;
1870                         cv_broadcast(&vdp->xdf_mstate_cv);
1871                 }
1872         }
1873 
1874         mutex_exit(&vdp->xdf_dev_lk);
1875 
1876         cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
1877             (uint64_t)vdp->xdf_xdev_nblocks);
1878 
1879         /* Restart any currently queued up io */
1880         xdf_io_start(vdp);
1881 
1882         /*
1883          * To get to the ready state we have to do IO to the backend device,
1884          * but we can't initiate IO from the other end change callback thread
1885          * (which is the current context we're executing in).  This is because
1886          * if the other end disconnects while we're doing IO from the callback
1887          * thread, then we can't receive that disconnect event and we hang
1888          * waiting for an IO that can never complete.
1889          */
1890         (void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
1891             DDI_SLEEP);
1892 
1893         (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1894         return (DDI_SUCCESS);
1895 }
1896 
1897 /*ARGSUSED*/
1898 static void
1899 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1900 {
1901         XenbusState new_state = *(XenbusState *)impl_data;
1902         xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1903 
1904         DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1905             vdp->xdf_addr, new_state));
1906 
1907         mutex_enter(&vdp->xdf_cb_lk);
1908 
1909         /* We assume that this callback is single threaded */
1910         ASSERT(vdp->xdf_oe_change_thread == NULL);
1911         DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
1912 
1913         /* ignore any backend state changes if we're suspending/suspended */
1914         if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
1915                 DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1916                 mutex_exit(&vdp->xdf_cb_lk);
1917                 return;
1918         }
1919 
1920         switch (new_state) {
1921         case XenbusStateUnknown:
1922         case XenbusStateInitialising:
1923         case XenbusStateInitWait:
1924         case XenbusStateInitialised:
1925                 if (vdp->xdf_state == XD_INIT)
1926                         break;
1927 
1928                 xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1929                 if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1930                         break;
1931                 ASSERT(vdp->xdf_state == XD_INIT);
1932                 break;
1933 
1934         case XenbusStateConnected:
1935                 if ((vdp->xdf_state == XD_CONNECTED) ||
1936                     (vdp->xdf_state == XD_READY))
1937                         break;
1938 
1939                 if (vdp->xdf_state != XD_INIT) {
1940                         xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1941                         if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1942                                 break;
1943                         ASSERT(vdp->xdf_state == XD_INIT);
1944                 }
1945 
1946                 if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
1947                         xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1948                         break;
1949                 }
1950                 ASSERT(vdp->xdf_state == XD_CONNECTED);
1951                 break;
1952 
1953         case XenbusStateClosing:
1954                 if (xdf_isopen(vdp, -1)) {
1955                         cmn_err(CE_NOTE,
1956                             "xdf@%s: hot-unplug failed, still in use",
1957                             vdp->xdf_addr);
1958                         break;
1959                 }
1960                 /*FALLTHROUGH*/
1961         case XenbusStateClosed:
1962                 xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
1963                 break;
1964         }
1965 
1966         /* notify anybody waiting for oe state change */
1967         cv_broadcast(&vdp->xdf_dev_cv);
1968         DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1969         mutex_exit(&vdp->xdf_cb_lk);
1970 }
1971 
1972 static int
1973 xdf_connect_locked(xdf_t *vdp, boolean_t wait)
1974 {
1975         int     rv, timeouts = 0, reset = 20;
1976 
1977         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1978         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1979 
1980         /* we can't connect once we're in the closed state */
1981         if (vdp->xdf_state == XD_CLOSED)
1982                 return (XD_CLOSED);
1983 
1984         vdp->xdf_connect_req++;
1985         while (vdp->xdf_state != XD_READY) {
1986                 mutex_exit(&vdp->xdf_dev_lk);
1987 
1988                 /* only one thread at a time can be the connection thread */
1989                 if (vdp->xdf_connect_thread == NULL)
1990                         vdp->xdf_connect_thread = curthread;
1991 
1992                 if (vdp->xdf_connect_thread == curthread) {
1993                         if ((timeouts > 0) && ((timeouts % reset) == 0)) {
1994                                 /*
1995                                  * If we haven't established a connection
1996                                  * within the reset time, then disconnect
1997                                  * so we can try again, and double the reset
1998                                  * time.  The reset time starts at 2 sec.
1999                                  */
2000                                 (void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2001                                 reset *= 2;
2002                         }
2003                         if (vdp->xdf_state == XD_UNKNOWN)
2004                                 (void) xdf_setstate_init(vdp);
2005                         if (vdp->xdf_state == XD_INIT)
2006                                 (void) xdf_setstate_connected(vdp);
2007                 }
2008 
2009                 mutex_enter(&vdp->xdf_dev_lk);
2010                 if (!wait || (vdp->xdf_state == XD_READY))
2011                         goto out;
2012 
2013                 mutex_exit((&vdp->xdf_cb_lk));
2014                 if (vdp->xdf_connect_thread != curthread) {
2015                         rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
2016                 } else {
2017                         /* delay for 0.1 sec */
2018                         rv = cv_reltimedwait_sig(&vdp->xdf_dev_cv,
2019                             &vdp->xdf_dev_lk, drv_usectohz(100*1000),
2020                             TR_CLOCK_TICK);
2021                         if (rv == -1)
2022                                 timeouts++;
2023                 }
2024                 mutex_exit((&vdp->xdf_dev_lk));
2025                 mutex_enter((&vdp->xdf_cb_lk));
2026                 mutex_enter((&vdp->xdf_dev_lk));
2027                 if (rv == 0)
2028                         goto out;
2029         }
2030 
2031 out:
2032         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2033         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
2034 
2035         if (vdp->xdf_connect_thread == curthread) {
2036                 /*
2037                  * wake up someone else so they can become the connection
2038                  * thread.
2039                  */
2040                 cv_signal(&vdp->xdf_dev_cv);
2041                 vdp->xdf_connect_thread = NULL;
2042         }
2043 
2044         /* Try to lock the media */
2045         mutex_exit((&vdp->xdf_dev_lk));
2046         (void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2047         mutex_enter((&vdp->xdf_dev_lk));
2048 
2049         vdp->xdf_connect_req--;
2050         return (vdp->xdf_state);
2051 }
2052 
2053 static uint_t
2054 xdf_iorestart(caddr_t arg)
2055 {
2056         xdf_t *vdp = (xdf_t *)arg;
2057 
2058         ASSERT(vdp != NULL);
2059 
2060         mutex_enter(&vdp->xdf_dev_lk);
2061         ASSERT(ISDMACBON(vdp));
2062         SETDMACBOFF(vdp);
2063         mutex_exit(&vdp->xdf_dev_lk);
2064 
2065         xdf_io_start(vdp);
2066 
2067         return (DDI_INTR_CLAIMED);
2068 }
2069 
2070 #if defined(XPV_HVM_DRIVER)
2071 
2072 typedef struct xdf_hvm_entry {
2073         list_node_t     xdf_he_list;
2074         char            *xdf_he_path;
2075         dev_info_t      *xdf_he_dip;
2076 } xdf_hvm_entry_t;
2077 
2078 static list_t xdf_hvm_list;
2079 static kmutex_t xdf_hvm_list_lock;
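
     /*
      * Usage sketch for the list above: xdf_hvm_add()/xdf_hvm_rm() register an
      * xdf dip under its ddi_pathname(), and a consumer that looks an instance
      * up by path is expected to drop the hold when done, e.g.
      *
      *	if ((dip = xdf_hvm_hold(path)) != NULL) {
      *		... use the xdf instance ...
      *		ndi_rele_devi(dip);
      *	}
      */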
2080 
2081 static xdf_hvm_entry_t *
2082 i_xdf_hvm_find(const char *path, dev_info_t *dip)
2083 {
2084         xdf_hvm_entry_t *i;
2085 
2086         ASSERT((path != NULL) || (dip != NULL));
2087         ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2088 
2089         i = list_head(&xdf_hvm_list);
2090         while (i != NULL) {
2091                 if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2092                         i = list_next(&xdf_hvm_list, i);
2093                         continue;
2094                 }
2095                 if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2096                         i = list_next(&xdf_hvm_list, i);
2097                         continue;
2098                 }
2099                 break;
2100         }
2101         return (i);
2102 }
2103 
2104 dev_info_t *
2105 xdf_hvm_hold(const char *path)
2106 {
2107         xdf_hvm_entry_t *i;
2108         dev_info_t      *dip;
2109 
2110         mutex_enter(&xdf_hvm_list_lock);
2111         i = i_xdf_hvm_find(path, NULL);
2112         if (i == NULL) {
2113                 mutex_exit(&xdf_hvm_list_lock);
2114                 return (NULL);
2115         }
2116         ndi_hold_devi(dip = i->xdf_he_dip);
2117         mutex_exit(&xdf_hvm_list_lock);
2118         return (dip);
2119 }
2120 
2121 static void
2122 xdf_hvm_add(dev_info_t *dip)
2123 {
2124         xdf_hvm_entry_t *i;
2125         char            *path;
2126 
2127         /* figure out the path for the dip */
2128         path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2129         (void) ddi_pathname(dip, path);
2130 
2131         i = kmem_alloc(sizeof (*i), KM_SLEEP);
2132         i->xdf_he_dip = dip;
2133         i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2134 
2135         mutex_enter(&xdf_hvm_list_lock);
2136         ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2137         ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2138         list_insert_head(&xdf_hvm_list, i);
2139         mutex_exit(&xdf_hvm_list_lock);
2140 
2141         kmem_free(path, MAXPATHLEN);
2142 }
2143 
2144 static void
2145 xdf_hvm_rm(dev_info_t *dip)
2146 {
2147         xdf_hvm_entry_t *i;
2148 
2149         mutex_enter(&xdf_hvm_list_lock);
2150         VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2151         list_remove(&xdf_hvm_list, i);
2152         mutex_exit(&xdf_hvm_list_lock);
2153 
2154         kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2155         kmem_free(i, sizeof (*i));
2156 }
2157 
2158 static void
2159 xdf_hvm_init(void)
2160 {
2161         list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2162             offsetof(xdf_hvm_entry_t, xdf_he_list));
2163         mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2164 }
2165 
2166 static void
2167 xdf_hvm_fini(void)
2168 {
2169         ASSERT(list_head(&xdf_hvm_list) == NULL);
2170         list_destroy(&xdf_hvm_list);
2171         mutex_destroy(&xdf_hvm_list_lock);
2172 }
2173 
2174 boolean_t
2175 xdf_hvm_connect(dev_info_t *dip)
2176 {
2177         xdf_t   *vdp = (xdf_t *)ddi_get_driver_private(dip);
2178         char    *oename, *str;
2179         int     rv;
2180 
2181         mutex_enter(&vdp->xdf_cb_lk);
2182 
2183         /*
2184          * Before trying to establish a connection we need to wait for the
2185          * backend hotplug scripts to have run.  Once they are run the
2186          * "<oename>/hotplug-status" property will be set to "connected".
2187          */
2188         for (;;) {
2189                 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2190 
2191                 /*
2192                  * Get the xenbus path to the backend device.  Note that
2193                  * we can't cache this path (and we look it up on each pass
2194                  * through this loop) because it could change during
2195                  * suspend, resume, and migration operations.
2196                  */
2197                 if ((oename = xvdi_get_oename(dip)) == NULL) {
2198                         mutex_exit(&vdp->xdf_cb_lk);
2199                         return (B_FALSE);
2200                 }
2201 
2202                 str = NULL;
2203                 if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
2204                     (strcmp(str, XBV_HP_STATUS_CONN) == 0))
2205                         break;
2206 
2207                 if (str != NULL)
2208                         strfree(str);
2209 
2210                 /* wait for an update to "<oename>/hotplug-status" */
2211                 if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
2212                         /* we got interrupted by a signal */
2213                         mutex_exit(&vdp->xdf_cb_lk);
2214                         return (B_FALSE);
2215                 }
2216         }
2217 
2218         /* Good news.  The backend hotplug scripts have been run. */
2219         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2220         ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
2221         strfree(str);
2222 
2223         /*
2224          * If we're emulating a cd device and if the backend doesn't support
2225          * media request operations, then we're not going to bother trying
2226          * to establish a connection for a couple of reasons.  First off, media
2227          * request support is required to support operations like eject and
2228          * media locking.  Second, other backend platforms like Linux don't
2229          * support hvm pv cdrom access.  They don't even have a backend pv
2230          * driver for cdrom device nodes, so we don't want to block forever
2231          * waiting for a connection to a backend driver that doesn't exist.
2232          */
2233         if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
2234                 mutex_exit(&vdp->xdf_cb_lk);
2235                 return (B_FALSE);
2236         }
2237 
2238         mutex_enter(&vdp->xdf_dev_lk);
2239         rv = xdf_connect_locked(vdp, B_TRUE);
2240         mutex_exit(&vdp->xdf_dev_lk);
2241         mutex_exit(&vdp->xdf_cb_lk);
2242 
2243         return ((rv == XD_READY) ? B_TRUE : B_FALSE);
2244 }
2245 
2246 int
2247 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2248 {
2249         xdf_t   *vdp = (xdf_t *)ddi_get_driver_private(dip);
2250 
2251         /* sanity check the requested physical geometry */
2252         mutex_enter(&vdp->xdf_dev_lk);
2253         if ((geomp->g_secsize != XB_BSIZE) ||
2254             (geomp->g_capacity == 0)) {
2255                 mutex_exit(&vdp->xdf_dev_lk);
2256                 return (EINVAL);
2257         }
2258 
2259         /*
2260          * If we've already connected to the backend device then make sure
2261          * we're not defining a physical geometry larger than our backend
2262          * device.
2263          */
2264         if ((vdp->xdf_xdev_nblocks != 0) &&
2265             (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2266                 mutex_exit(&vdp->xdf_dev_lk);
2267                 return (EINVAL);
2268         }
2269 
2270         bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
2271         vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
2272         vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
2273         vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
2274         vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
2275         vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
2276         vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
2277         vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
2278         vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
2279 
2280         vdp->xdf_pgeom_fixed = B_TRUE;
2281         mutex_exit(&vdp->xdf_dev_lk);
2282 
2283         /* force a re-validation */
2284         cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2285 
2286         return (0);
2287 }
2288 
2289 boolean_t
2290 xdf_is_cd(dev_info_t *dip)
2291 {
2292         xdf_t           *vdp = (xdf_t *)ddi_get_driver_private(dip);
2293         boolean_t       rv;
2294 
2295         mutex_enter(&vdp->xdf_cb_lk);
2296         rv = XD_IS_CD(vdp);
2297         mutex_exit(&vdp->xdf_cb_lk);
2298         return (rv);
2299 }
2300 
2301 boolean_t
2302 xdf_is_rm(dev_info_t *dip)
2303 {
2304         xdf_t           *vdp = (xdf_t *)ddi_get_driver_private(dip);
2305         boolean_t       rv;
2306 
2307         mutex_enter(&vdp->xdf_cb_lk);
2308         rv = XD_IS_RM(vdp);
2309         mutex_exit(&vdp->xdf_cb_lk);
2310         return (rv);
2311 }
2312 
2313 boolean_t
2314 xdf_media_req_supported(dev_info_t *dip)
2315 {
2316         xdf_t           *vdp = (xdf_t *)ddi_get_driver_private(dip);
2317         boolean_t       rv;
2318 
2319         mutex_enter(&vdp->xdf_cb_lk);
2320         rv = vdp->xdf_media_req_supported;
2321         mutex_exit(&vdp->xdf_cb_lk);
2322         return (rv);
2323 }
2324 
2325 #endif /* XPV_HVM_DRIVER */
2326 
2327 static int
2328 xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
2329 {
2330         xdf_t *vdp;
2331         vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2332 
2333         if (vdp == NULL)
2334                 return (ENXIO);
2335 
2336         mutex_enter(&vdp->xdf_dev_lk);
2337         *capp = vdp->xdf_pgeom.g_capacity;
2338         DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
2339         mutex_exit(&vdp->xdf_dev_lk);
2340         return (0);
2341 }
2342 
2343 static int
2344 xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2345 {
2346         xdf_t *vdp;
2347 
2348         if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
2349                 return (ENXIO);
2350         *geomp = vdp->xdf_pgeom;
2351         return (0);
2352 }
2353 
2354 /*
2355  * No real HBA, no geometry available from it
2356  */
2357 /*ARGSUSED*/
2358 static int
2359 xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2360 {
2361         return (EINVAL);
2362 }
2363 
2364 static int
2365 xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
2366 {
2367         xdf_t *vdp;
2368 
2369         if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
2370                 return (ENXIO);
2371 
2372         if (XD_IS_RO(vdp))
2373                 tgattributep->media_is_writable = 0;
2374         else
2375                 tgattributep->media_is_writable = 1;
2376         return (0);
2377 }
2378 
2379 /* ARGSUSED3 */
2380 int
2381 xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
2382 {
2383         int instance;
2384         xdf_t   *vdp;
2385 
2386         instance = ddi_get_instance(dip);
2387 
2388         if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
2389                 return (ENXIO);
2390 
2391         switch (cmd) {
2392         case TG_GETPHYGEOM:
2393                 return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
2394         case TG_GETVIRTGEOM:
2395                 return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
2396         case TG_GETCAPACITY:
2397                 return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
2398         case TG_GETBLOCKSIZE:
2399                 mutex_enter(&vdp->xdf_cb_lk);
2400                 *(uint32_t *)arg = vdp->xdf_xdev_secsize;
2401                 mutex_exit(&vdp->xdf_cb_lk);
2402                 return (0);
2403         case TG_GETATTR:
2404                 return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
2405         default:
2406                 return (ENOTTY);
2407         }
2408 }
2409 
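     /*
      * Synchronous IO entry point used by the cmlb/tg layer and by internal
      * callers such as xdf_get_flush_block().  A raw buf is built for the
      * request, pushed through the normal IO path, and biowait()ed on; when
      * running on the ready taskq thread the ring is additionally drained by
      * polling (xdf_ring_drain()) rather than waiting only on interrupts.
      */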
2410 /* ARGSUSED5 */
2411 int
2412 xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
2413     diskaddr_t start, size_t reqlen, void *tg_cookie)
2414 {
2415         xdf_t *vdp;
2416         struct buf *bp;
2417         int err = 0;
2418 
2419         vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2420 
2421         /* We don't allow IO from the oe_change callback thread */
2422         ASSERT(curthread != vdp->xdf_oe_change_thread);
2423 
2424         if ((start + ((reqlen / (vdp->xdf_xdev_secsize / DEV_BSIZE))
2425             >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
2426                 return (EINVAL);
2427 
2428         bp = getrbuf(KM_SLEEP);
2429         if (cmd == TG_READ)
2430                 bp->b_flags = B_BUSY | B_READ;
2431         else
2432                 bp->b_flags = B_BUSY | B_WRITE;
2433 
2434         bp->b_un.b_addr = bufp;
2435         bp->b_bcount = reqlen;
2436         bp->b_blkno = start * (vdp->xdf_xdev_secsize / DEV_BSIZE);
2437         bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
2438 
2439         mutex_enter(&vdp->xdf_dev_lk);
2440         xdf_bp_push(vdp, bp);
2441         mutex_exit(&vdp->xdf_dev_lk);
2442         xdf_io_start(vdp);
2443         if (curthread == vdp->xdf_ready_tq_thread)
2444                 (void) xdf_ring_drain(vdp);
2445         err = biowait(bp);
2446         ASSERT(bp->b_flags & B_DONE);
2447         freerbuf(bp);
2448         return (err);
2449 }
2450 
2451 /*
2452  * Lock the current media.  Set the media state to "lock".
2453  * (Media locks are only respected by the backend driver.)
2454  */
2455 static int
2456 xdf_ioctl_mlock(xdf_t *vdp)
2457 {
2458         int rv;
2459         mutex_enter(&vdp->xdf_cb_lk);
2460         rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2461         mutex_exit(&vdp->xdf_cb_lk);
2462         return (rv);
2463 }
2464 
2465 /*
2466  * Release a media lock.  Set the media state to "none".
2467  */
2468 static int
2469 xdf_ioctl_munlock(xdf_t *vdp)
2470 {
2471         int rv;
2472         mutex_enter(&vdp->xdf_cb_lk);
2473         rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
2474         mutex_exit(&vdp->xdf_cb_lk);
2475         return (rv);
2476 }
2477 
2478 /*
2479  * Eject the current media.  Ignores any media locks.  (Media locks
2480  * are only for the benefit of the backend.)
2481  */
2482 static int
2483 xdf_ioctl_eject(xdf_t *vdp)
2484 {
2485         int rv;
2486 
2487         mutex_enter(&vdp->xdf_cb_lk);
2488         if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
2489                 mutex_exit(&vdp->xdf_cb_lk);
2490                 return (rv);
2491         }
2492 
2493         /*
2494  * We've set the media-req xenbus parameter to eject, so now
2495  * disconnect from the backend, wait for the backend to clear
2496  * the media-req xenbus parameter, and then we can reconnect
2497          * to the backend.
2498          */
2499         (void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2500         mutex_enter(&vdp->xdf_dev_lk);
2501         if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
2502                 mutex_exit(&vdp->xdf_dev_lk);
2503                 mutex_exit(&vdp->xdf_cb_lk);
2504                 return (EIO);
2505         }
2506         mutex_exit(&vdp->xdf_dev_lk);
2507         mutex_exit(&vdp->xdf_cb_lk);
2508         return (0);
2509 }
2510 
2511 /*
2512  * Watch for media state changes.  This can be an insertion of a device
2513  * (triggered by a 'xm block-configure' request in another domain) or
2514  * the ejection of a device (triggered by a local "eject" operation).
2515  * For a full description of the DKIOCSTATE ioctl behavior see dkio(7I).
2516  */
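     /*
      * Callers pass in the media state they last observed; if it matches our
      * current state we block until the state changes (or a signal arrives),
      * which is how DKIOCSTATE-style polling is typically driven.
      */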
2517 static int
2518 xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
2519 {
2520         enum dkio_state         prev_state;
2521 
2522         mutex_enter(&vdp->xdf_cb_lk);
2523         prev_state = vdp->xdf_mstate;
2524 
2525         if (vdp->xdf_mstate == mstate) {
2526                 while (vdp->xdf_mstate == prev_state) {
2527                         if (cv_wait_sig(&vdp->xdf_mstate_cv,
2528                             &vdp->xdf_cb_lk) == 0) {
2529                                 mutex_exit(&vdp->xdf_cb_lk);
2530                                 return (EINTR);
2531                         }
2532                 }
2533         }
2534 
2535         if ((prev_state != DKIO_INSERTED) &&
2536             (vdp->xdf_mstate == DKIO_INSERTED)) {
2537                 (void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2538                 mutex_exit(&vdp->xdf_cb_lk);
2539                 return (0);
2540         }
2541 
2542         mutex_exit(&vdp->xdf_cb_lk);
2543         return (0);
2544 }
2545 
2546 /*ARGSUSED*/
2547 static int
2548 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2549     int *rvalp)
2550 {
2551         minor_t         minor = getminor(dev);
2552         int             part = XDF_PART(minor);
2553         xdf_t           *vdp;
2554         int             rv;
2555 
2556         if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
2557             (!xdf_isopen(vdp, part)))
2558                 return (ENXIO);
2559 
2560         DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
2561             vdp->xdf_addr, cmd, cmd));
2562 
2563         switch (cmd) {
2564         default:
2565                 return (ENOTTY);
2566         case DKIOCG_PHYGEOM:
2567         case DKIOCG_VIRTGEOM:
2568         case DKIOCGGEOM:
2569         case DKIOCSGEOM:
2570         case DKIOCGAPART:
2571         case DKIOCSAPART:
2572         case DKIOCGVTOC:
2573         case DKIOCSVTOC:
2574         case DKIOCPARTINFO:
2575         case DKIOCGEXTVTOC:
2576         case DKIOCSEXTVTOC:
2577         case DKIOCEXTPARTINFO:
2578         case DKIOCGMBOOT:
2579         case DKIOCSMBOOT:
2580         case DKIOCGETEFI:
2581         case DKIOCSETEFI:
2582         case DKIOCSETEXTPART:
2583         case DKIOCPARTITION:
2584                 return (cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
2585                     rvalp, NULL));
2586         case FDEJECT:
2587         case DKIOCEJECT:
2588         case CDROMEJECT:
2589                 return (xdf_ioctl_eject(vdp));
2590         case DKIOCLOCK:
2591                 return (xdf_ioctl_mlock(vdp));
2592         case DKIOCUNLOCK:
2593                 return (xdf_ioctl_munlock(vdp));
2594         case CDROMREADOFFSET: {
2595                 int offset = 0;
2596                 if (!XD_IS_CD(vdp))
2597                         return (ENOTTY);
2598                 if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
2599                         return (EFAULT);
2600                 return (0);
2601         }
2602         case DKIOCGMEDIAINFO: {
2603                 struct dk_minfo media_info;
2604 
2605                 media_info.dki_lbsize = vdp->xdf_xdev_secsize;
2606                 media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
2607                 if (XD_IS_CD(vdp))
2608                         media_info.dki_media_type = DK_CDROM;
2609                 else
2610                         media_info.dki_media_type = DK_FIXED_DISK;
2611 
2612                 if (ddi_copyout(&media_info, (void *)arg,
2613                     sizeof (struct dk_minfo), mode))
2614                         return (EFAULT);
2615                 return (0);
2616         }
2617         case DKIOCINFO: {
2618                 struct dk_cinfo info;
2619 
2620                 /* controller information */
2621                 if (XD_IS_CD(vdp))
2622                         info.dki_ctype = DKC_CDROM;
2623                 else
2624                         info.dki_ctype = DKC_VBD;
2625 
2626                 info.dki_cnum = 0;
2627                 (void) strncpy((char *)(&info.dki_cname), "xdf", 8);
2628 
2629                 /* unit information */
2630                 info.dki_unit = ddi_get_instance(vdp->xdf_dip);
2631                 (void) strncpy((char *)(&info.dki_dname), "xdf", 8);
2632                 info.dki_flags = DKI_FMTVOL;
2633                 info.dki_partition = part;
2634                 info.dki_maxtransfer = maxphys / DEV_BSIZE;
2635                 info.dki_addr = 0;
2636                 info.dki_space = 0;
2637                 info.dki_prio = 0;
2638                 info.dki_vec = 0;
2639 
2640                 if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
2641                         return (EFAULT);
2642                 return (0);
2643         }
2644         case DKIOCSTATE: {
2645                 enum dkio_state mstate;
2646 
2647                 if (ddi_copyin((void *)arg, &mstate,
2648                     sizeof (mstate), mode) != 0)
2649                         return (EFAULT);
2650                 if ((rv = xdf_dkstate(vdp, mstate)) != 0)
2651                         return (rv);
2652                 mstate = vdp->xdf_mstate;
2653                 if (ddi_copyout(&mstate, (void *)arg,
2654                     sizeof (mstate), mode) != 0)
2655                         return (EFAULT);
2656                 return (0);
2657         }
2658         case DKIOCREMOVABLE: {
2659                 int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2660                 if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
2661                         return (EFAULT);
2662                 return (0);
2663         }
2664         case DKIOCGETWCE: {
2665                 int i = BOOLEAN2VOID(vdp->xdf_wce);
2666                 if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
2667                         return (EFAULT);
2668                 return (0);
2669         }
2670         case DKIOCSETWCE: {
2671                 int i;
2672                 if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
2673                         return (EFAULT);
2674                 vdp->xdf_wce = VOID2BOOLEAN(i);
2675                 return (0);
2676         }
2677         case DKIOCFLUSHWRITECACHE: {
2678                 struct dk_callback *dkc = (struct dk_callback *)arg;
2679 
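                     /*
                      * Prefer the backend's native cache-flush op when it is
                      * supported; otherwise fall back to a barrier write of the
                      * cached flush block (see xdf_get_flush_block()), and fail
                      * with ENOTTY if neither mechanism is available.
                      */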
2680                 if (vdp->xdf_flush_supported) {
2681                         rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2682                             NULL, 0, 0, (void *)dev);
2683                 } else if (vdp->xdf_feature_barrier &&
2684                     !xdf_barrier_flush_disable) {
2685                         rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2686                             vdp->xdf_cache_flush_block, xdf_flush_block,
2687                             vdp->xdf_xdev_secsize, (void *)dev);
2688                 } else {
2689                         return (ENOTTY);
2690                 }
2691                 if ((mode & FKIOCTL) && (dkc != NULL) &&
2692                     (dkc->dkc_callback != NULL)) {
2693                         (*dkc->dkc_callback)(dkc->dkc_cookie, rv);
2694                         /* need to return 0 after calling callback */
2695                         rv = 0;
2696                 }
2697                 return (rv);
2698         }
2699         }
2700         /*NOTREACHED*/
2701 }
2702 
2703 static int
2704 xdf_strategy(struct buf *bp)
2705 {
2706         xdf_t   *vdp;
2707         minor_t minor;
2708         diskaddr_t p_blkct, p_blkst;
2709         daddr_t blkno;
2710         ulong_t nblks;
2711         int part;
2712 
2713         minor = getminor(bp->b_edev);
2714         part = XDF_PART(minor);
2715         vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));
2716 
2717         mutex_enter(&vdp->xdf_dev_lk);
2718         if (!xdf_isopen(vdp, part)) {
2719                 mutex_exit(&vdp->xdf_dev_lk);
2720                 xdf_io_err(bp, ENXIO, 0);
2721                 return (0);
2722         }
2723 
2724         /* We don't allow IO from the oe_change callback thread */
2725         ASSERT(curthread != vdp->xdf_oe_change_thread);
2726 
2727         /* Check for writes to a read only device */
2728         if (!IS_READ(bp) && XD_IS_RO(vdp)) {
2729                 mutex_exit(&vdp->xdf_dev_lk);
2730                 xdf_io_err(bp, EROFS, 0);
2731                 return (0);
2732         }
2733 
2734         /* Check if this I/O is accessing a partition or the entire disk */
2735         if ((long)bp->b_private == XB_SLICE_NONE) {
2736                 /* This I/O is using an absolute offset */
2737                 p_blkct = vdp->xdf_xdev_nblocks;
2738                 p_blkst = 0;
2739         } else {
2740                 /* This I/O is using a partition relative offset */
2741                 mutex_exit(&vdp->xdf_dev_lk);
2742                 if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
2743                     &p_blkst, NULL, NULL, NULL)) {
2744                         xdf_io_err(bp, ENXIO, 0);
2745                         return (0);
2746                 }
2747                 mutex_enter(&vdp->xdf_dev_lk);
2748         }
2749 
2750         /*
2751          * Adjust the real blkno and bcount according to the underlying
2752          * physical sector size.
2753          */
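             /*
              * For example (illustrative values): with a 4096 byte backend
              * sector size and XB_BSIZE of 512, a request at b_blkno 8 maps to
              * backend block 8 / (4096 / 512) = 1, and nblks below is counted
              * in 4096 byte units.
              */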
2754         blkno = bp->b_blkno / (vdp->xdf_xdev_secsize / XB_BSIZE);
2755 
2756         /* check for a starting block beyond the disk or partition limit */
2757         if (blkno > p_blkct) {
2758                 DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
2759                     vdp->xdf_addr, (longlong_t)blkno, (uint64_t)p_blkct));
2760                 mutex_exit(&vdp->xdf_dev_lk);
2761                 xdf_io_err(bp, EINVAL, 0);
2762                 return (0);
2763         }
2764 
2765         /* Legacy: don't set the error flag in this case */
2766         if (blkno == p_blkct) {
2767                 mutex_exit(&vdp->xdf_dev_lk);
2768                 bp->b_resid = bp->b_bcount;
2769                 biodone(bp);
2770                 return (0);
2771         }
2772 
2773         /* sanitize the input buf */
2774         bioerror(bp, 0);
2775         bp->b_resid = 0;
2776         bp->av_back = bp->av_forw = NULL;
2777 
2778         /* Adjust for a partial transfer; this will result in an error later */
2779         if (vdp->xdf_xdev_secsize != 0 &&
2780             vdp->xdf_xdev_secsize != XB_BSIZE) {
2781                 nblks = bp->b_bcount / vdp->xdf_xdev_secsize;
2782         } else {
2783                 nblks = bp->b_bcount >> XB_BSHIFT;
2784         }
2785 
2786         if ((blkno + nblks) > p_blkct) {
2787                 if (vdp->xdf_xdev_secsize != 0 &&
2788                     vdp->xdf_xdev_secsize != XB_BSIZE) {
2789                         bp->b_resid =
2790                             ((blkno + nblks) - p_blkct) *
2791                             vdp->xdf_xdev_secsize;
2792                 } else {
2793                         bp->b_resid =
2794                             ((blkno + nblks) - p_blkct) <<
2795                             XB_BSHIFT;
2796                 }
2797                 bp->b_bcount -= bp->b_resid;
2798         }
2799 
2800         DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
2801             vdp->xdf_addr, (longlong_t)blkno, (ulong_t)bp->b_bcount));
2802 
2803         /* Fix up the buf struct */
2804         bp->b_flags |= B_BUSY;
2805         bp->b_private = (void *)(uintptr_t)p_blkst;
2806 
2807         xdf_bp_push(vdp, bp);
2808         mutex_exit(&vdp->xdf_dev_lk);
2809         xdf_io_start(vdp);
2810         if (do_polled_io)
2811                 (void) xdf_ring_drain(vdp);
2812         return (0);
2813 }
2814 
2815 /*ARGSUSED*/
2816 static int
2817 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
2818 {
2819         xdf_t   *vdp;
2820         minor_t minor;
2821         diskaddr_t p_blkcnt;
2822         int part;
2823 
2824         minor = getminor(dev);
2825         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2826                 return (ENXIO);
2827 
2828         DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
2829             vdp->xdf_addr, (int64_t)uiop->uio_offset));
2830 
2831         part = XDF_PART(minor);
2832         if (!xdf_isopen(vdp, part))
2833                 return (ENXIO);
2834 
2835         if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2836             NULL, NULL, NULL, NULL))
2837                 return (ENXIO);
2838 
2839         if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2840                 return (ENOSPC);
2841 
2842         if (U_INVAL(uiop))
2843                 return (EINVAL);
2844 
2845         return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
2846 }
2847 
2848 /*ARGSUSED*/
2849 static int
2850 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
2851 {
2852         xdf_t *vdp;
2853         minor_t minor;
2854         diskaddr_t p_blkcnt;
2855         int part;
2856 
2857         minor = getminor(dev);
2858         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2859                 return (ENXIO);
2860 
2861         DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
2862             vdp->xdf_addr, (int64_t)uiop->uio_offset));
2863 
2864         part = XDF_PART(minor);
2865         if (!xdf_isopen(vdp, part))
2866                 return (ENXIO);
2867 
2868         if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2869             NULL, NULL, NULL, NULL))
2870                 return (ENXIO);
2871 
2872         if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2873                 return (ENOSPC);
2874 
2875         if (U_INVAL(uiop))
2876                 return (EINVAL);
2877 
2878         return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
2879 }
2880 
2881 /*ARGSUSED*/
2882 static int
2883 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
2884 {
2885         xdf_t   *vdp;
2886         minor_t minor;
2887         struct uio *uiop = aiop->aio_uio;
2888         diskaddr_t p_blkcnt;
2889         int part;
2890 
2891         minor = getminor(dev);
2892         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2893                 return (ENXIO);
2894 
2895         part = XDF_PART(minor);
2896         if (!xdf_isopen(vdp, part))
2897                 return (ENXIO);
2898 
2899         if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2900             NULL, NULL, NULL, NULL))
2901                 return (ENXIO);
2902 
2903         if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2904                 return (ENOSPC);
2905 
2906         if (U_INVAL(uiop))
2907                 return (EINVAL);
2908 
2909         return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
2910 }
2911 
2912 /*ARGSUSED*/
2913 static int
2914 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
2915 {
2916         xdf_t *vdp;
2917         minor_t minor;
2918         struct uio *uiop = aiop->aio_uio;
2919         diskaddr_t p_blkcnt;
2920         int part;
2921 
2922         minor = getminor(dev);
2923         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2924                 return (ENXIO);
2925 
2926         part = XDF_PART(minor);
2927         if (!xdf_isopen(vdp, part))
2928                 return (ENXIO);
2929 
2930         if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2931             NULL, NULL, NULL, NULL))
2932                 return (ENXIO);
2933 
2934         if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2935                 return (ENOSPC);
2936 
2937         if (U_INVAL(uiop))
2938                 return (EINVAL);
2939 
2940         return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
2941 }
2942 
2943 static int
2944 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
2945 {
2946         struct buf dumpbuf, *dbp = &dumpbuf;
2947         xdf_t   *vdp;
2948         minor_t minor;
2949         int err = 0;
2950         int part;
2951         diskaddr_t p_blkcnt, p_blkst;
2952 
2953         minor = getminor(dev);
2954         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2955                 return (ENXIO);
2956 
2957         DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
2958             vdp->xdf_addr, (void *)addr, blkno, nblk));
2959 
2960         /* We don't allow IO from the oe_change callback thread */
2961         ASSERT(curthread != vdp->xdf_oe_change_thread);
2962 
2963         part = XDF_PART(minor);
2964         if (!xdf_isopen(vdp, part))
2965                 return (ENXIO);
2966 
2967         if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
2968             NULL, NULL, NULL))
2969                 return (ENXIO);
2970 
2971         if ((blkno + nblk) >
2972             (p_blkcnt * (vdp->xdf_xdev_secsize / XB_BSIZE))) {
2973                 cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
2974                     vdp->xdf_addr, (daddr_t)((blkno + nblk) /
2975                     (vdp->xdf_xdev_secsize / XB_BSIZE)), (uint64_t)p_blkcnt);
2976                 return (EINVAL);
2977         }
2978 
2979         bioinit(dbp);
2980         dbp->b_flags = B_BUSY;
2981         dbp->b_un.b_addr = addr;
2982         dbp->b_bcount = nblk << DEV_BSHIFT;
2983         dbp->b_blkno = blkno;
2984         dbp->b_edev = dev;
2985         dbp->b_private = (void *)(uintptr_t)p_blkst;
2986 
2987         mutex_enter(&vdp->xdf_dev_lk);
2988         xdf_bp_push(vdp, dbp);
2989         mutex_exit(&vdp->xdf_dev_lk);
2990         xdf_io_start(vdp);
2991         err = xdf_ring_drain(vdp);
2992         biofini(dbp);
2993         return (err);
2994 }
2995 
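/*
 * close(9E) entry point.  Clear the open state for this partition and
 * open type; layered (OTYP_LYR) opens are reference counted, and any
 * exclusive open of the partition is released.
 */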
2996 /*ARGSUSED*/
2997 static int
2998 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
2999 {
3000         minor_t minor;
3001         xdf_t   *vdp;
3002         int part;
3003         ulong_t parbit;
3004 
3005         minor = getminor(dev);
3006         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3007                 return (ENXIO);
3008 
3009         mutex_enter(&vdp->xdf_dev_lk);
3010         part = XDF_PART(minor);
3011         if (!xdf_isopen(vdp, part)) {
3012                 mutex_exit(&vdp->xdf_dev_lk);
3013                 return (ENXIO);
3014         }
3015         parbit = 1 << part;
3016 
3017         ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
3018         if (otyp == OTYP_LYR) {
3019                 ASSERT(vdp->xdf_vd_lyropen[part] > 0);
3020                 if (--vdp->xdf_vd_lyropen[part] == 0)
3021                         vdp->xdf_vd_open[otyp] &= ~parbit;
3022         } else {
3023                 vdp->xdf_vd_open[otyp] &= ~parbit;
3024         }
3025         vdp->xdf_vd_exclopen &= ~parbit;
3026 
3027         mutex_exit(&vdp->xdf_dev_lk);
3028         return (0);
3029 }
3030 
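/*
 * open(9E) entry point.  Unless this is a non-blocking (FNDELAY/FNONBLOCK)
 * open, we wait here for the device to reach the XD_READY state.  Writable
 * opens of read-only devices fail with EROFS, and exclusive-open semantics
 * are enforced via xdf_vd_exclopen.  The first open of the device forces
 * cmlb to re-validate the label; blocking opens additionally require a
 * valid label and a non-zero-sized partition.
 */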
3031 static int
3032 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
3033 {
3034         minor_t minor;
3035         xdf_t   *vdp;
3036         int part;
3037         ulong_t parbit;
3038         diskaddr_t p_blkct = 0;
3039         boolean_t firstopen;
3040         boolean_t nodelay;
3041 
3042         minor = getminor(*devp);
3043         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3044                 return (ENXIO);
3045 
3046         nodelay = (flag & (FNDELAY | FNONBLOCK));
3047 
3048         DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));
3049 
3050         /* Wait (cv_wait) until we're connected or the connection fails */
3051         mutex_enter(&vdp->xdf_cb_lk);
3052         mutex_enter(&vdp->xdf_dev_lk);
3053         if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
3054                 mutex_exit(&vdp->xdf_dev_lk);
3055                 mutex_exit(&vdp->xdf_cb_lk);
3056                 return (ENXIO);
3057         }
3058         mutex_exit(&vdp->xdf_cb_lk);
3059 
3060         if ((flag & FWRITE) && XD_IS_RO(vdp)) {
3061                 mutex_exit(&vdp->xdf_dev_lk);
3062                 return (EROFS);
3063         }
3064 
3065         part = XDF_PART(minor);
3066         parbit = 1 << part;
3067         if ((vdp->xdf_vd_exclopen & parbit) ||
3068             ((flag & FEXCL) && xdf_isopen(vdp, part))) {
3069                 mutex_exit(&vdp->xdf_dev_lk);
3070                 return (EBUSY);
3071         }
3072 
3073         /* are we the first one to open this node? */
3074         firstopen = !xdf_isopen(vdp, -1);
3075 
3076         if (otyp == OTYP_LYR)
3077                 vdp->xdf_vd_lyropen[part]++;
3078 
3079         vdp->xdf_vd_open[otyp] |= parbit;
3080 
3081         if (flag & FEXCL)
3082                 vdp->xdf_vd_exclopen |= parbit;
3083 
3084         mutex_exit(&vdp->xdf_dev_lk);
3085 
3086         /* force a re-validation */
3087         if (firstopen)
3088                 cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
3089 
3090         /* If this is a non-blocking open then we're done */
3091         if (nodelay)
3092                 return (0);
3093 
3094         /*
3095          * This is a blocking open, so we require:
3096          * - that the disk have a valid label on it
3097          * - that the size of the partition that we're opening is non-zero
3098          */
3099         if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
3100             NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
3101                 (void) xdf_close(*devp, flag, otyp, credp);
3102                 return (ENXIO);
3103         }
3104 
3105         return (0);
3106 }
3107 
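/*
 * xenbus watch callback, invoked whenever the backend's XBP_HP_STATUS
 * property changes.  It simply wakes up any threads waiting on
 * xdf_hp_status_cv.
 */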
3108 /*ARGSUSED*/
3109 static void
3110 xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
3111 {
3112         xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
3113         cv_broadcast(&vdp->xdf_hp_status_cv);
3114 }
3115 
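/*
 * prop_op(9E) entry point.  After verifying that the request really
 * targets this driver, requests are handed to cmlb_prop_op(); if no soft
 * state exists yet we fall back to the default ddi_prop_op().
 */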
3116 static int
3117 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
3118         char *name, caddr_t valuep, int *lengthp)
3119 {
3120         xdf_t   *vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
3121 
3122         /*
3123          * Sanity check that any dev_t or dip passed in corresponds to
3124          * this device driver.  On debug kernels we'll panic; on
3125          * non-debug kernels we'll return failure.
3126          */
3127         ASSERT(ddi_driver_major(dip) == xdf_major);
3128         ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
3129         if ((ddi_driver_major(dip) != xdf_major) ||
3130             ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
3131                 return (DDI_PROP_NOT_FOUND);
3132 
3133         if (vdp == NULL)
3134                 return (ddi_prop_op(dev, dip, prop_op, flags,
3135                     name, valuep, lengthp));
3136 
3137         return (cmlb_prop_op(vdp->xdf_vd_lbl,
3138             dev, dip, prop_op, flags, name, valuep, lengthp,
3139             XDF_PART(getminor(dev)), NULL));
3140 }
3141 
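/*
 * getinfo(9E) entry point: translate a dev_t into either its dev_info
 * pointer or its instance number.
 */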
3142 /*ARGSUSED*/
3143 static int
3144 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
3145 {
3146         int     instance = XDF_INST(getminor((dev_t)arg));
3147         xdf_t   *vbdp;
3148 
3149         switch (cmd) {
3150         case DDI_INFO_DEVT2DEVINFO:
3151                 if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
3152                         *rp = NULL;
3153                         return (DDI_FAILURE);
3154                 }
3155                 *rp = vbdp->xdf_dip;
3156                 return (DDI_SUCCESS);
3157 
3158         case DDI_INFO_DEVT2INSTANCE:
3159                 *rp = (void *)(uintptr_t)instance;
3160                 return (DDI_SUCCESS);
3161 
3162         default:
3163                 return (DDI_FAILURE);
3164         }
3165 }
3166 
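/*
 * DDI_RESUME handler.  Resume the xvdi state, re-register the
 * XBP_HP_STATUS watch, reset the state machine to XD_UNKNOWN, and restart
 * the connection handshake via xdf_setstate_init().
 */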
3167 /*ARGSUSED*/
3168 static int
3169 xdf_resume(dev_info_t *dip)
3170 {
3171         xdf_t   *vdp;
3172         char    *oename;
3173 
3174         if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
3175                 goto err;
3176 
3177         if (xdf_debug & SUSRES_DBG)
3178                 xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
3179 
3180         mutex_enter(&vdp->xdf_cb_lk);
3181 
3182         if (xvdi_resume(dip) != DDI_SUCCESS) {
3183                 mutex_exit(&vdp->xdf_cb_lk);
3184                 goto err;
3185         }
3186 
3187         if (((oename = xvdi_get_oename(dip)) == NULL) ||
3188             (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3189             xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
3190                 mutex_exit(&vdp->xdf_cb_lk);
3191                 goto err;
3192         }
3193 
3194         mutex_enter(&vdp->xdf_dev_lk);
3195         ASSERT(vdp->xdf_state != XD_READY);
3196         xdf_set_state(vdp, XD_UNKNOWN);
3197         mutex_exit(&vdp->xdf_dev_lk);
3198 
3199         if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3200                 mutex_exit(&vdp->xdf_cb_lk);
3201                 goto err;
3202         }
3203 
3204         mutex_exit(&vdp->xdf_cb_lk);
3205 
3206         if (xdf_debug & SUSRES_DBG)
3207                 xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
3208         return (DDI_SUCCESS);
3209 err:
3210         if (xdf_debug & SUSRES_DBG)
3211                 xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
3212         return (DDI_FAILURE);
3213 }
3214 
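/*
 * attach(9E) entry point.  DDI_RESUME is delegated to xdf_resume().  For
 * DDI_ATTACH we allocate and initialize the per-instance soft state
 * (locks, condition variables, the ready taskq, the soft interrupt, and
 * the xenbus watches), attach cmlb, and then kick off the connection
 * handshake with the backend via xdf_setstate_init().
 */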
3215 static int
3216 xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3217 {
3218         int                     n, instance = ddi_get_instance(dip);
3219         ddi_iblock_cookie_t     ibc, softibc;
3220         boolean_t               dev_iscd = B_FALSE;
3221         xdf_t                   *vdp;
3222         char                    *oename, *xsname, *str;
3223 
3224         if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
3225             "xdf_debug", 0)) != 0)
3226                 xdf_debug = n;
3227 
3228         switch (cmd) {
3229         case DDI_RESUME:
3230                 return (xdf_resume(dip));
3231         case DDI_ATTACH:
3232                 break;
3233         default:
3234                 return (DDI_FAILURE);
3235         }
3236         /* DDI_ATTACH */
3237 
3238         if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
3239             ((oename = xvdi_get_oename(dip)) == NULL))
3240                 return (DDI_FAILURE);
3241 
3242         /*
3243          * Disable auto-detach.  This is necessary so that we don't get
3244          * detached while we're disconnected from the back end.
3245          */
3246         if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
3247             DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
3248                 return (DDI_FAILURE);
3249 
3250         /* driver handles kernel-issued IOCTLs */
3251         if (ddi_prop_create(DDI_DEV_T_NONE, dip,
3252             DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
3253                 return (DDI_FAILURE);
3254 
3255         if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
3256                 return (DDI_FAILURE);
3257 
3258         if (ddi_get_soft_iblock_cookie(dip,
3259             DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
3260                 return (DDI_FAILURE);
3261 
3262         if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
3263                 cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
3264                     ddi_get_name_addr(dip));
3265                 return (DDI_FAILURE);
3266         }
3267         if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
3268                 dev_iscd = B_TRUE;
3269         strfree(str);
3270 
3271         if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
3272                 return (DDI_FAILURE);
3273 
3274         DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
3275         vdp = ddi_get_soft_state(xdf_ssp, instance);
3276         ddi_set_driver_private(dip, vdp);
3277         vdp->xdf_dip = dip;
3278         vdp->xdf_addr = ddi_get_name_addr(dip);
3279         vdp->xdf_suspending = B_FALSE;
3280         vdp->xdf_media_req_supported = B_FALSE;
3281         vdp->xdf_peer = INVALID_DOMID;
3282         vdp->xdf_evtchn = INVALID_EVTCHN;
3283         list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
3284             offsetof(v_req_t, v_link));
3285         cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
3286         cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
3287         cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
3288         mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3289         mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3290         mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3291         vdp->xdf_cmbl_reattach = B_TRUE;
3292         if (dev_iscd) {
3293                 vdp->xdf_dinfo |= VDISK_CDROM;
3294                 vdp->xdf_mstate = DKIO_EJECTED;
3295         } else {
3296                 vdp->xdf_mstate = DKIO_NONE;
3297         }
3298 
3299         if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
3300             1, TASKQ_DEFAULTPRI, 0)) == NULL)
3301                 goto errout0;
3302 
3303         if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3304             xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
3305                 goto errout0;
3306 
3307         if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
3308             &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
3309                 cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
3310                     ddi_get_name_addr(dip));
3311                 goto errout0;
3312         }
3313 
3314         /*
3315          * Initialize the physical geometry structure.  Note that currently
3316          * we don't know the size of the backend device so the number
3317          * of blocks on the device will be initialized to zero.  Once
3318          * we connect to the backend device we'll update the physical
3319          * geometry to reflect the real size of the device.
3320          */
3321         xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
3322         vdp->xdf_pgeom_fixed = B_FALSE;
3323 
3324         /*
3325          * Create the default device minor nodes (non-removable disk).
3326          * We will adjust the minor nodes once connected to the backend.
3327          */
3328         cmlb_alloc_handle(&vdp->xdf_vd_lbl);
3329         if (xdf_cmlb_attach(vdp) != 0) {
3330                 cmn_err(CE_WARN,
3331                     "xdf@%s: attach failed, cmlb attach failed",
3332                     ddi_get_name_addr(dip));
3333                 goto errout0;
3334         }
3335 
3336         /*
3337          * We ship with the disk write cache enabled by default.
3338          */
3339         vdp->xdf_wce = B_TRUE;
3340 
3341         mutex_enter(&vdp->xdf_cb_lk);
3342         /* Watch backend XenbusState change */
3343         if (xvdi_add_event_handler(dip,
3344             XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
3345                 mutex_exit(&vdp->xdf_cb_lk);
3346                 goto errout0;
3347         }
3348 
3349         if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3350                 cmn_err(CE_WARN, "xdf@%s: start connection failed",
3351                     ddi_get_name_addr(dip));
3352                 mutex_exit(&vdp->xdf_cb_lk);
3353                 goto errout1;
3354         }
3355         mutex_exit(&vdp->xdf_cb_lk);
3356 
3357 #if defined(XPV_HVM_DRIVER)
3358 
3359         xdf_hvm_add(dip);
3360 
3361         /* Report our version to dom0.  */
3362         if (xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d",
3363             HVMPV_XDF_VERS))
3364                 cmn_err(CE_WARN, "xdf: couldn't write version\n");
3365 
3366 #else /* !XPV_HVM_DRIVER */
3367 
3368         /* create kstat for iostat(1M) */
3369         if (xdf_kstat_create(dip, "xdf", instance) != 0) {
3370                 cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
3371                     ddi_get_name_addr(dip));
3372                 goto errout1;
3373         }
3374 
3375 #endif /* !XPV_HVM_DRIVER */
3376 
3377         ddi_report_dev(dip);
3378         DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
3379         return (DDI_SUCCESS);
3380 
3381 errout1:
3382         (void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
3383         xvdi_remove_event_handler(dip, XS_OE_STATE);
3384 errout0:
3385         if (vdp->xdf_vd_lbl != NULL) {
3386                 cmlb_detach(vdp->xdf_vd_lbl, NULL);
3387                 cmlb_free_handle(&vdp->xdf_vd_lbl);
3388                 vdp->xdf_vd_lbl = NULL;
3389         }
3390         if (vdp->xdf_softintr_id != NULL)
3391                 ddi_remove_softintr(vdp->xdf_softintr_id);
3392         xvdi_remove_xb_watch_handlers(dip);
3393         if (vdp->xdf_ready_tq != NULL)
3394                 ddi_taskq_destroy(vdp->xdf_ready_tq);
3395         mutex_destroy(&vdp->xdf_cb_lk);
3396         mutex_destroy(&vdp->xdf_dev_lk);
3397         cv_destroy(&vdp->xdf_dev_cv);
3398         cv_destroy(&vdp->xdf_hp_status_cv);
3399         ddi_soft_state_free(xdf_ssp, instance);
3400         ddi_set_driver_private(dip, NULL);
3401         ddi_prop_remove_all(dip);
3402         cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
3403         return (DDI_FAILURE);
3404 }
3405 
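/*
 * DDI_SUSPEND handler.  Suspend the xvdi state and, with both the callback
 * and device locks held, tear down the IO ring and move the state machine
 * into the XD_SUSPEND state.
 */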
3406 static int
3407 xdf_suspend(dev_info_t *dip)
3408 {
3409         int             instance = ddi_get_instance(dip);
3410         xdf_t           *vdp;
3411 
3412         if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
3413                 return (DDI_FAILURE);
3414 
3415         if (xdf_debug & SUSRES_DBG)
3416                 xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
3417 
3418         xvdi_suspend(dip);
3419 
3420         mutex_enter(&vdp->xdf_cb_lk);
3421         mutex_enter(&vdp->xdf_dev_lk);
3422 
3423         vdp->xdf_suspending = B_TRUE;
3424         xdf_ring_destroy(vdp);
3425         xdf_set_state(vdp, XD_SUSPEND);
3426         vdp->xdf_suspending = B_FALSE;
3427 
3428         mutex_exit(&vdp->xdf_dev_lk);
3429         mutex_exit(&vdp->xdf_cb_lk);
3430 
3431         if (xdf_debug & SUSRES_DBG)
3432                 xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
3433 
3434         return (DDI_SUCCESS);
3435 }
3436 
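/*
 * detach(9E) entry point.  DDI_SUSPEND is delegated to xdf_suspend().
 * DDI_DETACH only succeeds once the device has been disconnected
 * (XD_CLOSED), after which all per-instance resources are released.
 */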
3437 static int
3438 xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3439 {
3440         xdf_t *vdp;
3441         int instance;
3442 
3443         switch (cmd) {
3444 
3445         case DDI_PM_SUSPEND:
3446                 break;
3447 
3448         case DDI_SUSPEND:
3449                 return (xdf_suspend(dip));
3450 
3451         case DDI_DETACH:
3452                 break;
3453 
3454         default:
3455                 return (DDI_FAILURE);
3456         }
3457 
3458         instance = ddi_get_instance(dip);
3459         DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
3460         vdp = ddi_get_soft_state(xdf_ssp, instance);
3461 
3462         if (vdp == NULL)
3463                 return (DDI_FAILURE);
3464 
3465         mutex_enter(&vdp->xdf_cb_lk);
3466         xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
3467         if (vdp->xdf_state != XD_CLOSED) {
3468                 mutex_exit(&vdp->xdf_cb_lk);
3469                 return (DDI_FAILURE);
3470         }
3471         mutex_exit(&vdp->xdf_cb_lk);
3472 
3473         ASSERT(!ISDMACBON(vdp));
3474 
3475 #if defined(XPV_HVM_DRIVER)
3476         xdf_hvm_rm(dip);
3477 #endif /* XPV_HVM_DRIVER */
3478 
3479         if (vdp->xdf_timeout_id != 0)
3480                 (void) untimeout(vdp->xdf_timeout_id);
3481 
3482         xvdi_remove_event_handler(dip, XS_OE_STATE);
3483         ddi_taskq_destroy(vdp->xdf_ready_tq);
3484 
3485         cmlb_detach(vdp->xdf_vd_lbl, NULL);
3486         cmlb_free_handle(&vdp->xdf_vd_lbl);
3487 
3488         /* We'll support a backend running in domU later. */
3489 #ifdef  DOMU_BACKEND
3490         (void) xvdi_post_event(dip, XEN_HP_REMOVE);
3491 #endif
3492 
3493         list_destroy(&vdp->xdf_vreq_act);
3494         ddi_prop_remove_all(dip);
3495         xdf_kstat_delete(dip);
3496         ddi_remove_softintr(vdp->xdf_softintr_id);
3497         xvdi_remove_xb_watch_handlers(dip);
3498         ddi_set_driver_private(dip, NULL);
3499         cv_destroy(&vdp->xdf_dev_cv);
3500         mutex_destroy(&vdp->xdf_cb_lk);
3501         mutex_destroy(&vdp->xdf_dev_lk);
3502         if (vdp->xdf_cache_flush_block != NULL)
3503                 kmem_free(vdp->xdf_flush_mem, 2 * vdp->xdf_xdev_secsize);
3504         ddi_soft_state_free(xdf_ssp, instance);
3505         return (DDI_SUCCESS);
3506 }
3507 
3508 /*
3509  * Driver linkage structures.
3510  */
3511 static struct cb_ops xdf_cbops = {
3512         xdf_open,
3513         xdf_close,
3514         xdf_strategy,
3515         nodev,
3516         xdf_dump,
3517         xdf_read,
3518         xdf_write,
3519         xdf_ioctl,
3520         nodev,
3521         nodev,
3522         nodev,
3523         nochpoll,
3524         xdf_prop_op,
3525         NULL,
3526         D_MP | D_NEW | D_64BIT,
3527         CB_REV,
3528         xdf_aread,
3529         xdf_awrite
3530 };
3531 
3532 struct dev_ops xdf_devops = {
3533         DEVO_REV,               /* devo_rev */
3534         0,                      /* devo_refcnt */
3535         xdf_getinfo,            /* devo_getinfo */
3536         nulldev,                /* devo_identify */
3537         nulldev,                /* devo_probe */
3538         xdf_attach,             /* devo_attach */
3539         xdf_detach,             /* devo_detach */
3540         nodev,                  /* devo_reset */
3541         &xdf_cbops,         /* devo_cb_ops */
3542         NULL,                   /* devo_bus_ops */
3543         NULL,                   /* devo_power */
3544         ddi_quiesce_not_supported, /* devo_quiesce */
3545 };
3546 
3547 /*
3548  * Module linkage structures.
3549  */
3550 static struct modldrv modldrv = {
3551         &mod_driverops,             /* Type of module.  This one is a driver */
3552         "virtual block driver", /* short description */
3553         &xdf_devops         /* driver specific ops */
3554 };
3555 
3556 static struct modlinkage xdf_modlinkage = {
3557         MODREV_1, (void *)&modldrv, NULL
3558 };
3559 
3560 /*
3561  * standard module entry points
3562  */
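/*
 * _init() resolves our major number, initializes the soft state framework,
 * creates the v_req_t and ge_slot_t kmem caches, and installs the module,
 * unwinding all of the above if mod_install() fails.
 */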
3563 int
3564 _init(void)
3565 {
3566         int rc;
3567 
3568         xdf_major = ddi_name_to_major("xdf");
3569         if (xdf_major == (major_t)-1)
3570                 return (EINVAL);
3571 
3572         if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
3573                 return (rc);
3574 
3575         xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
3576             sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3577         xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
3578             sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3579 
3580 #if defined(XPV_HVM_DRIVER)
3581         xdf_hvm_init();
3582 #endif /* XPV_HVM_DRIVER */
3583 
3584         if ((rc = mod_install(&xdf_modlinkage)) != 0) {
3585 #if defined(XPV_HVM_DRIVER)
3586                 xdf_hvm_fini();
3587 #endif /* XPV_HVM_DRIVER */
3588                 kmem_cache_destroy(xdf_vreq_cache);
3589                 kmem_cache_destroy(xdf_gs_cache);
3590                 ddi_soft_state_fini(&xdf_ssp);
3591                 return (rc);
3592         }
3593 
3594         return (rc);
3595 }
3596 
3597 int
3598 _fini(void)
3599 {
3600         int err;
3601         if ((err = mod_remove(&xdf_modlinkage)) != 0)
3602                 return (err);
3603 
3604 #if defined(XPV_HVM_DRIVER)
3605         xdf_hvm_fini();
3606 #endif /* XPV_HVM_DRIVER */
3607 
3608         kmem_cache_destroy(xdf_vreq_cache);
3609         kmem_cache_destroy(xdf_gs_cache);
3610         ddi_soft_state_fini(&xdf_ssp);
3611 
3612         return (0);
3613 }
3614 
3615 int
3616 _info(struct modinfo *modinfop)
3617 {
3618         return (mod_info(&xdf_modlinkage, modinfop));
3619 }