1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  *
  25  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  26  */
  27 
  28 /*
  29  * Storage Volume Character and Block Driver (SV)
  30  *
  31  * This driver implements a simplistic /dev/{r}dsk/ interface to a
  32  * specified disk volume that is otherwise managed by the Prism
  33  * software.  The SV driver layers itself onto the underlying disk
  34  * device driver by changing function pointers in the cb_ops
  35  * structure.
  36  *
  37  * CONFIGURATION:
  38  *
  39  * 1. Configure the driver using the svadm utility.
  40  * 2. Access the device as before through /dev/rdsk/c?t?d?s?
  41  *
  42  * LIMITATIONS:
  43  *
  44  * This driver should NOT be used to share a device between another
  45  * DataServices user interface module (e.g., STE) and a user accessing
  46  * the device through the block device in O_WRITE mode.  This is because
  47  * writes through the block device are asynchronous (due to the page
  48  * cache) and so consistency between the block device user and the
  49  * STE user cannot be guaranteed.
  50  *
 * Data is copied between the system struct buf(9S) and the nsc_vec_t.
 * This is wasteful and slow.
  53  */
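
/*
 * Illustrative sketch of the cb_ops interposition described above.  This
 * is not part of the driver: the guard macro SV_EXAMPLE_CODE is
 * hypothetical and never defined, so the code below is never compiled.
 * The real swap is performed in sv_enable() under sv_mutex, saving the
 * target driver's entry points in its sv_maj_t.
 */
#ifdef SV_EXAMPLE_CODE
static int (*example_saved_strategy)(struct buf *);

static int
example_strategy(struct buf *bp)
{
        /* intercept the I/O request here, then call down to the driver */
        return ((*example_saved_strategy)(bp));
}

static void
example_interpose(struct cb_ops *cb)
{
        example_saved_strategy = cb->cb_strategy;       /* save original */
        cb->cb_strategy = example_strategy;             /* substitute ours */
}
#endif  /* SV_EXAMPLE_CODE */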
  54 
  55 #include <sys/debug.h>
  56 #include <sys/types.h>
  57 
  58 #include <sys/ksynch.h>
  59 #include <sys/kmem.h>
  60 #include <sys/errno.h>
  61 #include <sys/varargs.h>
  62 #include <sys/file.h>
  63 #include <sys/open.h>
  64 #include <sys/conf.h>
  65 #include <sys/cred.h>
  66 #include <sys/buf.h>
  67 #include <sys/uio.h>
  68 #ifndef DS_DDICT
  69 #include <sys/pathname.h>
  70 #endif
  71 #include <sys/aio_req.h>
  72 #include <sys/dkio.h>
  73 #include <sys/vtoc.h>
  74 #include <sys/cmn_err.h>
  75 #include <sys/modctl.h>
  76 #include <sys/ddi.h>
  77 #include <sys/sysmacros.h>
  78 #include <sys/sunddi.h>
  79 #include <sys/sunldi.h>
  80 #include <sys/nsctl/nsvers.h>
  81 
  82 #include <sys/nsc_thread.h>
  83 #include <sys/unistat/spcs_s.h>
  84 #include <sys/unistat/spcs_s_k.h>
  85 #include <sys/unistat/spcs_errors.h>
  86 
  87 #ifdef DS_DDICT
  88 #include "../contract.h"
  89 #endif
  90 
  91 #include "../nsctl.h"
  92 
  93 
  94 #include <sys/sdt.h>              /* dtrace is S10 or later */
  95 
  96 #include "sv.h"
  97 #include "sv_impl.h"
  98 #include "sv_efi.h"
  99 
 100 #define MAX_EINTR_COUNT 1000
 101 
 102 /*
 103  * sv_mod_status
 104  */
 105 #define SV_PREVENT_UNLOAD 1
 106 #define SV_ALLOW_UNLOAD 2
 107 
 108 static const int sv_major_rev = ISS_VERSION_MAJ;        /* Major number */
 109 static const int sv_minor_rev = ISS_VERSION_MIN;        /* Minor number */
 110 static const int sv_micro_rev = ISS_VERSION_MIC;        /* Micro number */
 111 static const int sv_baseline_rev = ISS_VERSION_NUM;     /* Baseline number */
 112 
 113 #ifdef DKIOCPARTITION
 114 /*
 115  * CRC32 polynomial table needed for computing the checksums
 116  * in an EFI vtoc.
 117  */
 118 static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
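
/*
 * Illustrative sketch of how a table-driven CRC32 such as sv_crc32_table
 * is applied one byte at a time.  The EFI checksum code uses the CRC32()
 * macro from <sys/crc32.h>; this hypothetical example (never compiled,
 * guarded by the undefined SV_EXAMPLE_CODE) shows only the underlying
 * technique.
 */
#ifdef SV_EXAMPLE_CODE
static uint32_t
example_crc32(const uint8_t *buf, size_t len)
{
        uint32_t crc = 0xffffffffU;     /* standard initial value */

        while (len--)                   /* fold in one byte per step */
                crc = sv_crc32_table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);

        return (crc ^ 0xffffffffU);     /* final inversion */
}
#endif  /* SV_EXAMPLE_CODE */
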
 119 #endif
 120 
 121 static clock_t sv_config_time;          /* Time of successful {en,dis}able */
 122 static int sv_debug;                    /* Set non-zero for debug to syslog */
 123 static int sv_mod_status;               /* Set to prevent modunload */
 124 
 125 static dev_info_t *sv_dip;              /* Single DIP for driver */
 126 static kmutex_t sv_mutex;               /* Protect global lists, etc. */
 127 
 128 static nsc_mem_t        *sv_mem;        /* nsctl memory allocator token */
 129 
 130 
 131 /*
 132  * Per device and per major state.
 133  */
 134 
 135 #ifndef _SunOS_5_6
 136 #define UNSAFE_ENTER()
 137 #define UNSAFE_EXIT()
 138 #else
 139 #define UNSAFE_ENTER()  mutex_enter(&unsafe_driver)
 140 #define UNSAFE_EXIT()   mutex_exit(&unsafe_driver)
 141 #endif
 142 
 143                                         /* hash table of major dev structures */
 144 static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
 145 static sv_dev_t *sv_devs;               /* array of per device structures */
 146 static int sv_max_devices;              /* SV version of nsc_max_devices() */
 147 static int sv_ndevices;                 /* number of SV enabled devices */
 148 
 149 /*
 150  * Threading.
 151  */
 152 
 153 int sv_threads_max = 1024;              /* maximum # to dynamically alloc */
 154 int sv_threads = 32;                    /* # to pre-allocate (see sv.conf) */
int sv_threads_extra = 0;               /* additional # we would have allocated */
 156 
 157 static nstset_t *sv_tset;               /* the threadset pointer */
 158 
 159 static int sv_threads_hysteresis = 4;   /* hysteresis for threadset resizing */
 160 static int sv_threads_dev = 2;          /* # of threads to alloc per device */
 161 static int sv_threads_inc = 8;          /* increment for changing the set */
 162 static int sv_threads_needed;           /* number of threads needed */
 163 static int sv_no_threads;               /* number of nsc_create errors */
 164 static int sv_max_nlive;                /* max number of threads running */
 165 
 166 
 167 
 168 /*
 169  * nsctl fd callbacks.
 170  */
 171 
 172 static int svattach_fd(blind_t);
 173 static int svdetach_fd(blind_t);
 174 
 175 static nsc_def_t sv_fd_def[] = {
 176         { "Attach",     (uintptr_t)svattach_fd, },
 177         { "Detach",     (uintptr_t)svdetach_fd, },
 178         { 0, 0, }
 179 };
 180 
 181 /*
 182  * cb_ops functions.
 183  */
 184 
 185 static int svopen(dev_t *, int, int, cred_t *);
 186 static int svclose(dev_t, int, int, cred_t *);
 187 static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
 188 static int svprint(dev_t, char *);
 189 
 190 /*
 191  * These next functions are layered into the underlying driver's devops.
 192  */
 193 
 194 static int sv_lyr_open(dev_t *, int, int, cred_t *);
 195 static int sv_lyr_close(dev_t, int, int, cred_t *);
 196 static int sv_lyr_strategy(struct buf *);
 197 static int sv_lyr_read(dev_t, struct uio *, cred_t *);
 198 static int sv_lyr_write(dev_t, struct uio *, cred_t *);
 199 static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
 200 static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
 201 static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
 202 
 203 static struct cb_ops sv_cb_ops = {
 204         svopen,         /* open */
 205         svclose,        /* close */
 206         nulldev,        /* strategy */
 207         svprint,
 208         nodev,          /* dump */
 209         nodev,          /* read */
 210         nodev,          /* write */
 211         svioctl,
 212         nodev,          /* devmap */
 213         nodev,          /* mmap */
 214         nodev,          /* segmap */
 215         nochpoll,       /* poll */
 216         ddi_prop_op,
 217         NULL,           /* NOT a stream */
 218         D_NEW | D_MP | D_64BIT,
 219         CB_REV,
 220         nodev,          /* aread */
 221         nodev,          /* awrite */
 222 };
 223 
 224 
 225 /*
 226  * dev_ops functions.
 227  */
 228 
 229 static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
 230 static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
 231 static int sv_detach(dev_info_t *, ddi_detach_cmd_t);
 232 
 233 static struct dev_ops sv_ops = {
 234         DEVO_REV,
 235         0,
 236         sv_getinfo,
 237         nulldev,        /* identify */
 238         nulldev,        /* probe */
 239         sv_attach,
 240         sv_detach,
 241         nodev,          /* reset */
 242         &sv_cb_ops,
 243         (struct bus_ops *)0
 244 };
 245 
 246 /*
 247  * Module linkage.
 248  */
 249 
 250 extern struct mod_ops mod_driverops;
 251 
 252 static struct modldrv modldrv = {
 253         &mod_driverops,
 254         "nws:Storage Volume:" ISS_VERSION_STR,
 255         &sv_ops
 256 };
 257 
 258 static struct modlinkage modlinkage = {
 259         MODREV_1,
 260         { &modldrv, NULL }
 261 };
 262 
 263 
 264 int
 265 _init(void)
 266 {
 267         int error;
 268 
 269         mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);
 270 
 271         if ((error = mod_install(&modlinkage)) != 0) {
 272                 mutex_destroy(&sv_mutex);
 273                 return (error);
 274         }
 275 
 276 #ifdef DEBUG
 277         cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
 278             sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
 279             ISS_VERSION_STR, BUILD_DATE_STR);
 280 #else
 281         if (sv_micro_rev) {
 282                 cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
 283                     sv_major_rev, sv_minor_rev, sv_micro_rev,
 284                     ISS_VERSION_STR, BUILD_DATE_STR);
 285         } else {
 286                 cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
 287                     sv_major_rev, sv_minor_rev,
 288                     ISS_VERSION_STR, BUILD_DATE_STR);
 289         }
 290 #endif
 291 
 292         return (error);
 293 }
 294 
 295 
 296 int
 297 _fini(void)
 298 {
 299         int error;
 300 
 301         if ((error = mod_remove(&modlinkage)) != 0)
 302                 return (error);
 303 
 304         mutex_destroy(&sv_mutex);
 305 
 306         return (error);
 307 }
 308 
 309 
 310 int
 311 _info(struct modinfo *modinfop)
 312 {
 313         return (mod_info(&modlinkage, modinfop));
 314 }
 315 
 316 
 317 /*
 318  * Locking & State.
 319  *
 320  * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
 321  * threadset creation and sizing; sv_ndevices.
 322  *
 323  * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
 324  * must be acquired first.
 325  *
 326  * sv_lock protects the sv_dev_t structure for an individual device.
 327  *
 328  * sv_olock protects the otyp/open members of the sv_dev_t.  If we need
 329  * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
 330  * first.
 331  *
 332  * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
 333  * I/O operations to a device simultaneously, as above.
 334  *
 335  * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
 336  * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
 337  * and (sv_pending == curthread) so that any recursion through
 338  * sv_lyr_open/sv_lyr_close can be detected.
 339  */
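
/*
 * Illustrative sketch of the lock ordering described above (hypothetical,
 * never compiled: SV_EXAMPLE_CODE is not defined anywhere).  sv_mutex is
 * always taken before a per-device sv_lock, and sv_lock before sv_olock;
 * releases happen in the reverse order.
 */
#ifdef SV_EXAMPLE_CODE
static void
example_lock_order(sv_dev_t *svp)
{
        mutex_enter(&sv_mutex);                 /* 1: global config lock */
        rw_enter(&svp->sv_lock, RW_WRITER);     /* 2: per-device state */
        mutex_enter(&svp->sv_olock);            /* 3: otyp/open state */

        /* ... manipulate device state ... */

        mutex_exit(&svp->sv_olock);
        rw_exit(&svp->sv_lock);
        mutex_exit(&sv_mutex);
}
#endif  /* SV_EXAMPLE_CODE */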
 340 
 341 
 342 static int
 343 sv_init_devs(void)
 344 {
 345         int i;
 346 
 347         ASSERT(MUTEX_HELD(&sv_mutex));
 348 
 349         if (sv_max_devices > 0)
 350                 return (0);
 351 
 352         sv_max_devices = nsc_max_devices();
 353 
 354         if (sv_max_devices <= 0) {
 355                 /* nsctl is not attached (nskernd not running) */
 356                 if (sv_debug > 0)
 357                         cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
 358                 return (EAGAIN);
 359         }
 360 
 361         sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
 362             KM_NOSLEEP, sv_mem);
 363 
 364         if (sv_devs == NULL) {
 365                 cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
 366                 return (ENOMEM);
 367         }
 368 
 369         for (i = 0; i < sv_max_devices; i++) {
 370                 mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
 371                 rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
 372         }
 373 
 374         if (sv_debug > 0)
 375                 cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");
 376 
 377         return (0);
 378 }
 379 
 380 
 381 static int
 382 sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 383 {
 384         int rc;
 385 
 386         switch (cmd) {
 387 
 388         case DDI_ATTACH:
 389                 sv_dip = dip;
 390 
 391                 if (ddi_create_minor_node(dip, "sv", S_IFCHR,
 392                     0, DDI_PSEUDO, 0) != DDI_SUCCESS)
 393                         goto failed;
 394 
 395                 mutex_enter(&sv_mutex);
 396 
 397                 sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
 398                 if (sv_mem == NULL) {
 399                         mutex_exit(&sv_mutex);
 400                         goto failed;
 401                 }
 402 
 403                 rc = sv_init_devs();
 404                 if (rc != 0 && rc != EAGAIN) {
 405                         mutex_exit(&sv_mutex);
 406                         goto failed;
 407                 }
 408 
 409                 mutex_exit(&sv_mutex);
 410 
 411 
 412                 ddi_report_dev(dip);
 413 
 414                 sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
 415                     DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
 416                     "sv_threads", sv_threads);
 417 
 418                 if (sv_debug > 0)
 419                         cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);
 420 
 421                 if (sv_threads > sv_threads_max)
 422                         sv_threads_max = sv_threads;
 423 
 424                 return (DDI_SUCCESS);
 425 
 426         default:
 427                 return (DDI_FAILURE);
 428         }
 429 
 430 failed:
 431         DTRACE_PROBE(sv_attach_failed);
 432         (void) sv_detach(dip, DDI_DETACH);
 433         return (DDI_FAILURE);
 434 }
 435 
 436 
 437 static int
 438 sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 439 {
 440         sv_dev_t *svp;
 441         int i;
 442 
 443         switch (cmd) {
 444 
 445         case DDI_DETACH:
 446 
 447                 /*
 448                  * Check that everything is disabled.
 449                  */
 450 
 451                 mutex_enter(&sv_mutex);
 452 
 453                 if (sv_mod_status == SV_PREVENT_UNLOAD) {
 454                         mutex_exit(&sv_mutex);
 455                         DTRACE_PROBE(sv_detach_err_prevent);
 456                         return (DDI_FAILURE);
 457                 }
 458 
 459                 for (i = 0; sv_devs && i < sv_max_devices; i++) {
 460                         svp = &sv_devs[i];
 461 
 462                         if (svp->sv_state != SV_DISABLE) {
 463                                 mutex_exit(&sv_mutex);
 464                                 DTRACE_PROBE(sv_detach_err_busy);
 465                                 return (DDI_FAILURE);
 466                         }
 467                 }
 468 
 469 
 470                 for (i = 0; sv_devs && i < sv_max_devices; i++) {
 471                         mutex_destroy(&sv_devs[i].sv_olock);
 472                         rw_destroy(&sv_devs[i].sv_lock);
 473                 }
 474 
 475                 if (sv_devs) {
 476                         nsc_kmem_free(sv_devs,
 477                             (sv_max_devices * sizeof (*sv_devs)));
 478                         sv_devs = NULL;
 479                 }
 480                 sv_max_devices = 0;
 481 
 482                 if (sv_mem) {
 483                         nsc_unregister_mem(sv_mem);
 484                         sv_mem = NULL;
 485                 }
 486 
 487                 mutex_exit(&sv_mutex);
 488 
 489                 /*
 490                  * Remove all minor nodes.
 491                  */
 492 
 493                 ddi_remove_minor_node(dip, NULL);
 494                 sv_dip = NULL;
 495 
 496                 return (DDI_SUCCESS);
 497 
 498         default:
 499                 return (DDI_FAILURE);
 500         }
 501 }
 502 
 503 static sv_maj_t *
 504 sv_getmajor(const dev_t dev)
 505 {
 506         sv_maj_t **insert, *maj;
 507         major_t umaj = getmajor(dev);
 508 
 509         /*
 510          * See if the hash table entry, or one of the hash chains
 511          * is already allocated for this major number
 512          */
 513         if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
 514                 do {
 515                         if (maj->sm_major == umaj)
 516                                 return (maj);
 517                 } while ((maj = maj->sm_next) != 0);
 518         }
 519 
        /*
         * If sv_mutex is held here, there is a design flaw, as the only
         * callers that may reach this allocation path (sv_enable() and
         * sv_dev_to_sv()) do not hold it.  Return an error instead of
         * panicking the system.
         */
        if (MUTEX_HELD(&sv_mutex)) {
                cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
                return (NULL);
        }
 529 
 530         /*
 531          * Determine where to allocate a new element in the hash table
 532          */
        mutex_enter(&sv_mutex);
        insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
        for (maj = *insert; maj; maj = maj->sm_next) {

                /* Did another thread beat us to it? */
                if (maj->sm_major == umaj) {
                        mutex_exit(&sv_mutex);
                        return (maj);
                }

                /* Remember the tail of the chain as the insert point */
                if (maj->sm_next == NULL)
                        insert = &maj->sm_next;
        }
 545 
 546         /*
 547          * Located the new insert point
 548          */
 549         *insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
 550         if ((maj = *insert) != 0)
 551                 maj->sm_major = umaj;
 552         else
 553                 cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
 554 
 555         mutex_exit(&sv_mutex);
 556 
 557         return (maj);
 558 }
 559 
 560 /* ARGSUSED */
 561 
 562 static int
 563 sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 564 {
 565         int rc = DDI_FAILURE;
 566 
 567         switch (infocmd) {
 568 
 569         case DDI_INFO_DEVT2DEVINFO:
 570                 *result = sv_dip;
 571                 rc = DDI_SUCCESS;
 572                 break;
 573 
 574         case DDI_INFO_DEVT2INSTANCE:
 575                 /*
 576                  * We only have a single instance.
 577                  */
 578                 *result = 0;
 579                 rc = DDI_SUCCESS;
 580                 break;
 581 
 582         default:
 583                 break;
 584         }
 585 
 586         return (rc);
 587 }
 588 
 589 
 590 /*
 591  * Hashing of devices onto major device structures.
 592  *
 593  * Individual device structures are hashed onto one of the sm_hash[]
 594  * buckets in the relevant major device structure.
 595  *
 596  * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
 597  * searching does not require the mutex because of the sm_seq member.
 598  * sm_seq is incremented on each insertion (-after- hash chain pointer
 599  * manipulation) and each deletion (-before- hash chain pointer
 * manipulation).  When searching the hash chain, the seq number is
 * checked before accessing each device structure; if the seq number has
 * changed, we restart the search from the top of the hash chain.  If we
 * restart more than SV_HASH_RETRY times, we take sv_mutex and search the
 * hash chain under the lock (that search is guaranteed not to be
 * interrupted).
 606  */
 607 
 608 #define SV_HASH_RETRY   16
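
/*
 * Illustrative sketch of the writer side of the protocol described above
 * (hypothetical, never compiled: SV_EXAMPLE_CODE is not defined).
 * Insertion publishes the new element first and only then increments
 * sm_seq; deletion increments sm_seq first and only then unlinks.  Both
 * always run under sv_mutex; see sv_get_state() and sv_rm_hash().
 */
#ifdef SV_EXAMPLE_CODE
static void
example_hash_insert(sv_maj_t *maj, sv_dev_t **insert, sv_dev_t *svp)
{
        ASSERT(MUTEX_HELD(&sv_mutex));

        svp->sv_hash = NULL;
        *insert = svp;          /* link in first ... */
        maj->sm_seq++;          /* ... then invalidate running searches */
}
#endif  /* SV_EXAMPLE_CODE */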
 609 
 610 static sv_dev_t *
 611 sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
 612 {
 613         minor_t umin = getminor(dev);
 614         sv_dev_t **hb, *next, *svp;
 615         sv_maj_t *maj;
 616         int seq;
 617         int try;
 618 
 619         /* Get major hash table */
 620         maj = sv_getmajor(dev);
 621         if (majpp)
 622                 *majpp = maj;
 623         if (maj == NULL)
 624                 return (NULL);
 625 
 626         if (maj->sm_inuse == 0) {
 627                 DTRACE_PROBE1(
 628                     sv_dev_to_sv_end,
 629                     dev_t, dev);
 630                 return (NULL);
 631         }
 632 
 633         hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
 634         try = 0;
 635 
 636 retry:
 637         if (try > SV_HASH_RETRY)
 638                 mutex_enter(&sv_mutex);
 639 
 640         seq = maj->sm_seq;
 641         for (svp = *hb; svp; svp = next) {
 642                 next = svp->sv_hash;
 643 
 644                 nsc_membar_stld();      /* preserve register load order */
 645 
 646                 if (maj->sm_seq != seq) {
 647                         DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
 648                         try++;
 649                         goto retry;
 650                 }
 651 
 652                 if (svp->sv_dev == dev)
 653                         break;
 654         }
 655 
 656         if (try > SV_HASH_RETRY)
 657                 mutex_exit(&sv_mutex);
 658 
 659         return (svp);
 660 }
 661 
 662 
 663 /*
 664  * Must be called with sv_mutex held.
 665  */
 666 
 667 static int
 668 sv_get_state(const dev_t udev, sv_dev_t **svpp)
 669 {
 670         sv_dev_t **hb, **insert, *svp;
 671         sv_maj_t *maj;
 672         minor_t umin;
 673         int i;
 674 
 675         /* Get major hash table */
        if ((maj = sv_getmajor(udev)) == NULL)
                return (SV_EBADDEV);
 678 
 679         /* Determine which minor hash table */
 680         umin = getminor(udev);
 681         hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
 682 
 683         /* look for clash */
 684 
 685         insert = hb;
 686 
 687         for (svp = *hb; svp; svp = svp->sv_hash) {
 688                 if (svp->sv_dev == udev)
 689                         break;
 690 
 691                 if (svp->sv_hash == NULL)
 692                         insert = &svp->sv_hash;
 693         }
 694 
 695         if (svp) {
 696                 DTRACE_PROBE1(
 697                     sv_get_state_enabled,
 698                     dev_t, udev);
 699                 return (SV_EENABLED);
 700         }
 701 
 702         /* look for spare sv_devs slot */
 703 
 704         for (i = 0; i < sv_max_devices; i++) {
 705                 svp = &sv_devs[i];
 706 
 707                 if (svp->sv_state == SV_DISABLE)
 708                         break;
 709         }
 710 
 711         if (i >= sv_max_devices) {
 712                 DTRACE_PROBE1(
 713                     sv_get_state_noslots,
 714                     dev_t, udev);
 715                 return (SV_ENOSLOTS);
 716         }
 717 
 718         svp->sv_state = SV_PENDING;
 719         svp->sv_pending = curthread;
 720 
 721         *insert = svp;
 722         svp->sv_hash = NULL;
        maj->sm_seq++;  /* must be after the store to the hash chain */
 724 
 725         *svpp = svp;
 726 
 727         /*
 728          * We do not know the size of the underlying device at
 729          * this stage, so initialise "nblocks" property to
 730          * zero, and update it whenever we succeed in
 731          * nsc_reserve'ing the underlying nsc_fd_t.
 732          */
 733 
 734         svp->sv_nblocks = 0;
 735 
 736         return (0);
 737 }
 738 
 739 
/*
 * Remove a device structure from its hash chain.
 * Must be called with sv_mutex held.
 */
 744 
 745 static void
 746 sv_rm_hash(sv_dev_t *svp)
 747 {
 748         sv_dev_t **svpp;
 749         sv_maj_t *maj;
 750 
 751         /* Get major hash table */
 752         if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
 753                 return;
 754 
 755         /* remove svp from hash chain */
 756 
 757         svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
 758         while (*svpp) {
 759                 if (*svpp == svp) {
 760                         /*
 761                          * increment of sm_seq must be before the
 762                          * removal from the hash chain
 763                          */
 764                         maj->sm_seq++;
 765                         *svpp = svp->sv_hash;
 766                         break;
 767                 }
 768 
 769                 svpp = &(*svpp)->sv_hash;
 770         }
 771 
 772         svp->sv_hash = NULL;
 773 }
 774 
 775 /*
 776  * Free (disable) a device structure.
 777  * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will
 778  * perform the exits during its processing.
 779  */
 780 
 781 static int
 782 sv_free(sv_dev_t *svp, const int error)
 783 {
 784         struct cb_ops *cb_ops;
 785         sv_maj_t *maj;
 786 
        /* Get major hash table */
        if ((maj = sv_getmajor(svp->sv_dev)) == NULL) {
                /* release the locks that sv_free() is contracted to drop */
                rw_exit(&svp->sv_lock);
                mutex_exit(&sv_mutex);
                return (SV_EBADDEV);
        }
 790 
 791         svp->sv_state = SV_PENDING;
 792         svp->sv_pending = curthread;
 793 
 794         /*
 795          * Close the fd's before removing from the hash or swapping
 796          * back the cb_ops pointers so that the cache flushes before new
 797          * io can come in.
 798          */
 799 
 800         if (svp->sv_fd) {
 801                 (void) nsc_close(svp->sv_fd);
 802                 svp->sv_fd = 0;
 803         }
 804 
 805         sv_rm_hash(svp);
 806 
 807         if (error != SV_ESDOPEN &&
 808             error != SV_ELYROPEN && --maj->sm_inuse == 0) {
 809 
 810                 if (maj->sm_dev_ops)
 811                         cb_ops = maj->sm_dev_ops->devo_cb_ops;
 812                 else
 813                         cb_ops = NULL;
 814 
 815                 if (cb_ops && maj->sm_strategy != NULL) {
 816                         cb_ops->cb_strategy = maj->sm_strategy;
 817                         cb_ops->cb_close = maj->sm_close;
 818                         cb_ops->cb_ioctl = maj->sm_ioctl;
 819                         cb_ops->cb_write = maj->sm_write;
 820                         cb_ops->cb_open = maj->sm_open;
 821                         cb_ops->cb_read = maj->sm_read;
 822                         cb_ops->cb_flag = maj->sm_flag;
 823 
 824                         if (maj->sm_awrite)
 825                                 cb_ops->cb_awrite = maj->sm_awrite;
 826 
 827                         if (maj->sm_aread)
 828                                 cb_ops->cb_aread = maj->sm_aread;
 829 
 830                         /*
 831                          * corbin XXX
 832                          * Leave backing device ops in maj->sm_*
 833                          * to handle any requests that might come
 834                          * in during the disable.  This could be
 835                          * a problem however if the backing device
 836                          * driver is changed while we process these
 837                          * requests.
 838                          *
 839                          * maj->sm_strategy = 0;
 840                          * maj->sm_awrite = 0;
 841                          * maj->sm_write = 0;
 842                          * maj->sm_ioctl = 0;
 843                          * maj->sm_close = 0;
 844                          * maj->sm_aread = 0;
 845                          * maj->sm_read = 0;
 846                          * maj->sm_open = 0;
 847                          * maj->sm_flag = 0;
 848                          *
 849                          */
 850                 }
 851 
 852                 if (maj->sm_dev_ops) {
 853                         maj->sm_dev_ops = 0;
 854                 }
 855         }
 856 
 857         if (svp->sv_lh) {
 858                 cred_t *crp = ddi_get_cred();
 859 
 860                 /*
 861                  * Close the protective layered driver open using the
 862                  * Sun Private layered driver i/f.
 863                  */
 864 
 865                 (void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
 866                 svp->sv_lh = NULL;
 867         }
 868 
 869         svp->sv_timestamp = nsc_lbolt();
 870         svp->sv_state = SV_DISABLE;
 871         svp->sv_pending = NULL;
 872         rw_exit(&svp->sv_lock);
 873         mutex_exit(&sv_mutex);
 874 
 875         return (error);
 876 }
 877 
 878 /*
 879  * Reserve the device, taking into account the possibility that
 880  * the reserve might have to be retried.
 881  */
 882 static int
 883 sv_reserve(nsc_fd_t *fd, int flags)
 884 {
 885         int eintr_count;
 886         int rc;
 887 
 888         eintr_count = 0;
 889         do {
 890                 rc = nsc_reserve(fd, flags);
 891                 if (rc == EINTR) {
 892                         ++eintr_count;
 893                         delay(2);
 894                 }
 895         } while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));
 896 
 897         return (rc);
 898 }
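
/*
 * Typical usage of sv_reserve() (a hypothetical sketch, never compiled:
 * SV_EXAMPLE_CODE is not defined): reserve the device, perform the nsctl
 * I/O, then release.  See sv_enable() and sv_lyr_uio() for real callers.
 */
#ifdef SV_EXAMPLE_CODE
static int
example_reserved_io(sv_dev_t *svp)
{
        int rc;

        if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI | NSC_PCATCH)) != 0)
                return (rc);            /* could not reserve the device */

        /* ... nsc_uread()/nsc_uwrite()/nsc_alloc_buf() I/O here ... */

        nsc_release(svp->sv_fd);        /* always pair reserve with release */
        return (0);
}
#endif  /* SV_EXAMPLE_CODE */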
 899 
 900 static int
 901 sv_enable(const caddr_t path, const int flag,
 902     const dev_t udev, spcs_s_info_t kstatus)
 903 {
 904         struct dev_ops *dev_ops;
 905         struct cb_ops *cb_ops;
 906         sv_dev_t *svp;
 907         sv_maj_t *maj;
 908         nsc_size_t nblocks;
 909         int rc;
 910         cred_t *crp;
 911         ldi_ident_t     li;
 912 
 913         if (udev == (dev_t)-1 || udev == 0) {
 914                 DTRACE_PROBE1(
 915                     sv_enable_err_baddev,
 916                     dev_t, udev);
 917                 return (SV_EBADDEV);
 918         }
 919 
 920         if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
 921                 DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
 922                 return (SV_EAMODE);
 923         }
 924 
 925         /* Get major hash table */
 926         if ((maj = sv_getmajor(udev)) == NULL)
 927                 return (SV_EBADDEV);
 928 
 929         mutex_enter(&sv_mutex);
 930 
 931         rc = sv_get_state(udev, &svp);
 932         if (rc) {
 933                 mutex_exit(&sv_mutex);
 934                 DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
 935                 return (rc);
 936         }
 937 
 938         rw_enter(&svp->sv_lock, RW_WRITER);
 939 
 940         /*
 941          * Get real fd used for io
 942          */
 943 
 944         svp->sv_dev = udev;
 945         svp->sv_flag = flag;
 946 
 947         /*
 948          * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
 949          * function pointer before sv swaps them out.
 950          */
 951 
 952         svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
 953             sv_fd_def, (blind_t)udev, &rc);
 954 
 955         if (svp->sv_fd == NULL) {
 956                 if (kstatus)
 957                         spcs_s_add(kstatus, rc);
 958                 DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
 959                 return (sv_free(svp, SV_ESDOPEN));
 960         }
 961 
        /*
         * Perform a layered driver open using the Sun Private layered
         * driver i/f to ensure that the cb_ops structure for the driver
         * is not detached out from under us whilst sv is enabled.
         */
 968 
 969         crp = ddi_get_cred();
 970         svp->sv_lh = NULL;
 971 
 972         if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
 973                 rc = ldi_open_by_dev(&svp->sv_dev,
 974                     OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
 975         }
 976 
 977         if (rc != 0) {
 978                 if (kstatus)
 979                         spcs_s_add(kstatus, rc);
 980                 DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
 981                 return (sv_free(svp, SV_ELYROPEN));
 982         }
 983 
 984         /*
 985          * Do layering if required - must happen after nsc_open().
 986          */
 987 
 988         if (maj->sm_inuse++ == 0) {
 989                 maj->sm_dev_ops = nsc_get_devops(getmajor(udev));
 990 
 991                 if (maj->sm_dev_ops == NULL ||
 992                     maj->sm_dev_ops->devo_cb_ops == NULL) {
 993                         DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
 994                         return (sv_free(svp, SV_ELOAD));
 995                 }
 996 
 997                 dev_ops = maj->sm_dev_ops;
 998                 cb_ops = dev_ops->devo_cb_ops;
 999 
1000                 if (cb_ops->cb_strategy == NULL ||
1001                     cb_ops->cb_strategy == nodev ||
1002                     cb_ops->cb_strategy == nulldev) {
1003                         DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
1004                         return (sv_free(svp, SV_ELOAD));
1005                 }
1006 
1007                 if (cb_ops->cb_strategy == sv_lyr_strategy) {
1008                         DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
1009                         return (sv_free(svp, SV_ESTRATEGY));
1010                 }
1011 
1012                 maj->sm_strategy = cb_ops->cb_strategy;
1013                 maj->sm_close = cb_ops->cb_close;
1014                 maj->sm_ioctl = cb_ops->cb_ioctl;
1015                 maj->sm_write = cb_ops->cb_write;
1016                 maj->sm_open = cb_ops->cb_open;
1017                 maj->sm_read = cb_ops->cb_read;
1018                 maj->sm_flag = cb_ops->cb_flag;
1019 
1020                 cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
1021                 cb_ops->cb_strategy = sv_lyr_strategy;
1022                 cb_ops->cb_close = sv_lyr_close;
1023                 cb_ops->cb_ioctl = sv_lyr_ioctl;
1024                 cb_ops->cb_write = sv_lyr_write;
1025                 cb_ops->cb_open = sv_lyr_open;
1026                 cb_ops->cb_read = sv_lyr_read;
1027 
1028                 /*
1029                  * Check that the driver has async I/O entry points
1030                  * before changing them.
1031                  */
1032 
1033                 if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
1034                         maj->sm_awrite = 0;
1035                         maj->sm_aread = 0;
1036                 } else {
1037                         maj->sm_awrite = cb_ops->cb_awrite;
1038                         maj->sm_aread = cb_ops->cb_aread;
1039 
1040                         cb_ops->cb_awrite = sv_lyr_awrite;
1041                         cb_ops->cb_aread = sv_lyr_aread;
1042                 }
1043 
1044                 /*
1045                  * Bug 4645743
1046                  *
1047                  * Prevent sv from ever unloading after it has interposed
1048                  * on a major device because there is a race between
1049                  * sv removing its layered entry points from the target
1050                  * dev_ops, a client coming in and accessing the driver,
1051                  * and the kernel modunloading the sv text.
1052                  *
1053                  * To allow unload, do svboot -u, which only happens in
1054                  * pkgrm time.
1055                  */
1056                 ASSERT(MUTEX_HELD(&sv_mutex));
1057                 sv_mod_status = SV_PREVENT_UNLOAD;
1058         }
1059 
1060 
1061         svp->sv_timestamp = nsc_lbolt();
1062         svp->sv_state = SV_ENABLE;
1063         svp->sv_pending = NULL;
1064         rw_exit(&svp->sv_lock);
1065 
1066         sv_ndevices++;
1067         mutex_exit(&sv_mutex);
1068 
1069         nblocks = 0;
1070         if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
1071                 nblocks = svp->sv_nblocks;
1072                 nsc_release(svp->sv_fd);
1073         }
1074 
1075         cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
1076             svp->sv_dev, nblocks);
1077 
1078         return (0);
1079 }
1080 
1081 
static int
sv_prepare_unload(void)
1084 {
1085         int rc = 0;
1086 
1087         mutex_enter(&sv_mutex);
1088 
1089         if (sv_mod_status == SV_PREVENT_UNLOAD) {
1090                 if ((sv_ndevices != 0) || (sv_tset != NULL)) {
1091                         rc = EBUSY;
1092                 } else {
1093                         sv_mod_status = SV_ALLOW_UNLOAD;
1094                         delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
1095                 }
1096         }
1097 
1098         mutex_exit(&sv_mutex);
1099         return (rc);
1100 }
1101 
1102 static int
1103 svattach_fd(blind_t arg)
1104 {
1105         dev_t dev = (dev_t)arg;
1106         sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1107         int rc;
1108 
1109         if (sv_debug > 0)
1110                 cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);
1111 
1112         if (svp == NULL) {
1113                 cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
1114                 return (0);
1115         }
1116 
1117         if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
1118                 cmn_err(CE_WARN,
1119                     "!svattach_fd: nsc_partsize() failed, rc %d", rc);
1120                 svp->sv_nblocks = 0;
1121         }
1122 
1123         if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
1124                 cmn_err(CE_WARN,
1125                     "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
1126                 svp->sv_maxfbas = 0;
1127         }
1128 
1129         if (sv_debug > 0) {
1130                 cmn_err(CE_CONT,
1131                     "!svattach_fd(%p): size %" NSC_SZFMT ", "
1132                     "maxfbas %" NSC_SZFMT "\n",
1133                     arg, svp->sv_nblocks, svp->sv_maxfbas);
1134         }
1135 
1136         return (0);
1137 }
1138 
1139 
1140 static int
1141 svdetach_fd(blind_t arg)
1142 {
1143         dev_t dev = (dev_t)arg;
1144         sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1145 
1146         if (sv_debug > 0)
1147                 cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);
1148 
1149         /* svp can be NULL during disable of an sv */
1150         if (svp == NULL)
1151                 return (0);
1152 
1153         svp->sv_maxfbas = 0;
1154         svp->sv_nblocks = 0;
1155         return (0);
1156 }
1157 
1158 
/*
 * Side effect: sv_disable() acquires both sv_mutex and sv_lock(RW_WRITER);
 * on the success path sv_free() releases them both before returning.
 */
1163 
1164 /* ARGSUSED */
1165 static int
1166 sv_disable(dev_t dev, spcs_s_info_t kstatus)
1167 {
1168         sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1169 
1170         if (svp == NULL) {
1171 
1172                 DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
1173                 return (SV_ENODEV);
1174         }
1175 
1176         mutex_enter(&sv_mutex);
1177         rw_enter(&svp->sv_lock, RW_WRITER);
1178 
1179         if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
1180                 rw_exit(&svp->sv_lock);
1181                 mutex_exit(&sv_mutex);
1182 
1183                 DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
1184                 return (SV_EDISABLED);
1185         }
1186 
1187 
1188         sv_ndevices--;
1189         return (sv_free(svp, 0));
1190 }
1191 
1192 
1193 
1194 static int
1195 sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
1196 {
1197         nsc_buf_t *tmph;
1198         sv_dev_t *svp;
1199         sv_maj_t *maj;
1200         int (*fn)();
1201         dev_t odev;
1202         int ret;
1203         int rc;
1204 
1205         svp = sv_dev_to_sv(*devp, &maj);
1206 
1207         if (svp) {
1208                 if (svp->sv_state == SV_PENDING &&
1209                     svp->sv_pending == curthread) {
1210                         /*
1211                          * This is a recursive open from a call to
1212                          * ddi_lyr_open_by_devt and so we just want
1213                          * to pass it straight through to the
1214                          * underlying driver.
1215                          */
1216                         DTRACE_PROBE2(sv_lyr_open_recursive,
1217                             sv_dev_t *, svp,
1218                             dev_t, *devp);
1219                         svp = NULL;
1220                 } else
1221                         rw_enter(&svp->sv_lock, RW_READER);
1222         }
1223 
1224         odev = *devp;
1225 
1226         if (maj && (fn = maj->sm_open) != 0) {
1227                 if (!(maj->sm_flag & D_MP)) {
1228                         UNSAFE_ENTER();
1229                         ret = (*fn)(devp, flag, otyp, crp);
1230                         UNSAFE_EXIT();
1231                 } else {
1232                         ret = (*fn)(devp, flag, otyp, crp);
1233                 }
1234 
1235                 if (ret == 0) {
1236                         /*
1237                          * Re-acquire svp if the driver changed *devp.
1238                          */
1239 
1240                         if (*devp != odev) {
1241                                 if (svp != NULL)
1242                                         rw_exit(&svp->sv_lock);
1243 
1244                                 svp = sv_dev_to_sv(*devp, NULL);
1245 
1246                                 if (svp) {
1247                                         rw_enter(&svp->sv_lock, RW_READER);
1248                                 }
1249                         }
1250                 }
1251         } else {
1252                 ret = ENODEV;
1253         }
1254 
1255         if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
1256                 /*
1257                  * Underlying DDI open failed, but we have this
1258                  * device SV enabled.  If we can read some data
1259                  * from the device, fake a successful open (this
1260                  * probably means that this device is RDC'd and we
1261                  * are getting the data from the secondary node).
1262                  *
1263                  * The reserve must be done with NSC_TRY|NSC_NOWAIT to
1264                  * ensure that it does not deadlock if this open is
1265                  * coming from nskernd:get_bsize().
1266                  */
1267                 rc = sv_reserve(svp->sv_fd,
1268                     NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
1269                 if (rc == 0) {
1270                         tmph = NULL;
1271 
1272                         rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
1273                         if (rc <= 0) {
1274                                 /* success */
1275                                 ret = 0;
1276                         }
1277 
1278                         if (tmph) {
1279                                 (void) nsc_free_buf(tmph);
1280                                 tmph = NULL;
1281                         }
1282 
1283                         nsc_release(svp->sv_fd);
1284 
1285                         /*
1286                          * Count the number of layered opens that we
1287                          * fake since we have to fake a matching number
1288                          * of closes (OTYP_LYR open/close calls must be
1289                          * paired).
1290                          */
1291 
1292                         if (ret == 0 && otyp == OTYP_LYR) {
1293                                 mutex_enter(&svp->sv_olock);
1294                                 svp->sv_openlcnt++;
1295                                 mutex_exit(&svp->sv_olock);
1296                         }
1297                 }
1298         }
1299 
1300         if (svp) {
1301                 rw_exit(&svp->sv_lock);
1302         }
1303 
1304         return (ret);
1305 }
1306 
1307 
1308 static int
1309 sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
1310 {
1311         sv_dev_t *svp;
1312         sv_maj_t *maj;
1313         int (*fn)();
1314         int ret;
1315 
1316         svp = sv_dev_to_sv(dev, &maj);
1317 
1318         if (svp &&
1319             svp->sv_state == SV_PENDING &&
1320             svp->sv_pending == curthread) {
1321                 /*
1322                  * This is a recursive open from a call to
1323                  * ddi_lyr_close and so we just want
1324                  * to pass it straight through to the
1325                  * underlying driver.
1326                  */
1327                 DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
1328                     dev_t, dev);
1329                 svp = NULL;
1330         }
1331 
1332         if (svp) {
1333                 rw_enter(&svp->sv_lock, RW_READER);
1334 
1335                 if (otyp == OTYP_LYR) {
1336                         mutex_enter(&svp->sv_olock);
1337 
1338                         if (svp->sv_openlcnt) {
1339                                 /*
1340                                  * Consume sufficient layered closes to
1341                                  * account for the opens that we faked
1342                                  * whilst the device was failed.
1343                                  */
1344                                 svp->sv_openlcnt--;
1345                                 mutex_exit(&svp->sv_olock);
1346                                 rw_exit(&svp->sv_lock);
1347 
1348                                 DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);
1349 
1350                                 return (0);
1351                         }
1352 
1353                         mutex_exit(&svp->sv_olock);
1354                 }
1355         }
1356 
1357         if (maj && (fn = maj->sm_close) != 0) {
1358                 if (!(maj->sm_flag & D_MP)) {
1359                         UNSAFE_ENTER();
1360                         ret = (*fn)(dev, flag, otyp, crp);
1361                         UNSAFE_EXIT();
1362                 } else {
1363                         ret = (*fn)(dev, flag, otyp, crp);
1364                 }
1365         } else {
1366                 ret = ENODEV;
1367         }
1368 
1369         if (svp) {
1370                 rw_exit(&svp->sv_lock);
1371         }
1372 
1373         return (ret);
1374 }
1375 
1376 
1377 /*
1378  * Convert the specified dev_t into a locked and enabled sv_dev_t, or
1379  * return NULL.
1380  */
1381 static sv_dev_t *
1382 sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
1383 {
1384         sv_dev_t *svp;
1385 
1386         while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
1387                 rw_enter(&svp->sv_lock, RW_READER);
1388 
1389                 if (svp->sv_state == SV_ENABLE) {
1390                         /* locked and enabled */
1391                         break;
1392                 }
1393 
1394                 /*
1395                  * State was changed while waiting on the lock.
1396                  * Wait for a stable state.
1397                  */
1398                 rw_exit(&svp->sv_lock);
1399 
1400                 DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);
1401 
1402                 delay(2);
1403         }
1404 
1405         return (svp);
1406 }
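
/*
 * A hypothetical caller sketch (never compiled: SV_EXAMPLE_CODE is not
 * defined).  sv_find_enabled() returns with sv_lock held as reader, so
 * every caller must drop that lock when it has finished with the device.
 */
#ifdef SV_EXAMPLE_CODE
static int
example_find_enabled_caller(dev_t dev)
{
        sv_maj_t *maj;
        sv_dev_t *svp;

        if ((svp = sv_find_enabled(dev, &maj)) == NULL)
                return (ENODEV);        /* device is not SV enabled */

        /* ... use svp->sv_fd while the read lock pins the state ... */

        rw_exit(&svp->sv_lock);         /* release the lock taken above */
        return (0);
}
#endif  /* SV_EXAMPLE_CODE */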
1407 
1408 
1409 static int
1410 sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
1411 {
1412         sv_dev_t *svp;
1413         sv_maj_t *maj;
1414         int (*fn)();
1415         int rc;
1416 
1417         svp = sv_find_enabled(dev, &maj);
1418         if (svp == NULL) {
1419                 if (maj) {
1420                         if (rw == NSC_READ)
1421                                 fn = maj->sm_read;
1422                         else
1423                                 fn = maj->sm_write;
1424 
1425                         if (fn != 0) {
1426                                 if (!(maj->sm_flag & D_MP)) {
1427                                         UNSAFE_ENTER();
1428                                         rc = (*fn)(dev, uiop, crp);
1429                                         UNSAFE_EXIT();
1430                                 } else {
1431                                         rc = (*fn)(dev, uiop, crp);
1432                                 }
                        } else {
                                /* no underlying read/write entry point */
                                rc = ENODEV;
                        }

                        return (rc);
1436                 } else {
1437                         return (ENODEV);
1438                 }
1439         }
1440 
1441         ASSERT(RW_READ_HELD(&svp->sv_lock));
1442 
1443         if (svp->sv_flag == 0) {
1444                 /*
1445                  * guard access mode
1446                  * - prevent user level access to the device
1447                  */
1448                 DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
1449                 rc = EPERM;
1450                 goto out;
1451         }
1452 
1453         if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
1454                 DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
1455                 goto out;
1456         }
1457 
1458         if (rw == NSC_READ)
1459                 rc = nsc_uread(svp->sv_fd, uiop, crp);
1460         else
1461                 rc = nsc_uwrite(svp->sv_fd, uiop, crp);
1462 
1463         nsc_release(svp->sv_fd);
1464 
1465 out:
1466         rw_exit(&svp->sv_lock);
1467 
1468         return (rc);
1469 }
1470 
1471 
1472 static int
1473 sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
1474 {
1475         return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
1476 }
1477 
1478 
1479 static int
1480 sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
1481 {
1482         return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
1483 }
1484 
1485 
1486 /* ARGSUSED */
1487 
1488 static int
1489 sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
1490 {
1491         return (aphysio(sv_lyr_strategy,
1492             anocancel, dev, B_READ, minphys, aio));
1493 }
1494 
1495 
1496 /* ARGSUSED */
1497 
1498 static int
1499 sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
1500 {
1501         return (aphysio(sv_lyr_strategy,
1502             anocancel, dev, B_WRITE, minphys, aio));
1503 }
1504 
1505 
/*
 * Set up an array containing the list of raw path names.
 * The array for the paths is given by ptr, and its capacity,
 * in entries, by size.
 *
 * If there are more layered devices than will fit in the array,
 * the number of extra layered devices is returned via extra.
 * Otherwise zero is returned via extra.
 *
 * Input:
 *      ptr     : array of sv_name_t (or sv_name32_t if ilp32) for paths
 *      size    : number of entries in the array
 *
 * Output (extra):
 *      zero    : all paths fit in the array
 *      >0      : number of enabled devices that did not fit
 */
1523 
1524 static int
1525 sv_list(void *ptr, const int size, int *extra, const int ilp32)
1526 {
1527         sv_name32_t *svn32;
1528         sv_name_t *svn;
1529         sv_dev_t *svp;
1530         int *mode, *nblocks;
1531         int i, index;
1532         char *path;
1533 
1534         *extra = 0;
1535         index = 0;
1536 
1537         if (ilp32)
1538                 svn32 = ptr;
1539         else
1540                 svn = ptr;
1541 
1542         mutex_enter(&sv_mutex);
1543         for (i = 0; i < sv_max_devices; i++) {
1544                 svp = &sv_devs[i];
1545 
1546                 rw_enter(&svp->sv_lock, RW_READER);
1547 
1548                 if (svp->sv_state != SV_ENABLE) {
1549                         rw_exit(&svp->sv_lock);
1550                         continue;
1551                 }
1552 
1553                 if ((*extra) != 0 || ptr == NULL) {
1554                         /* Another overflow entry */
1555                         rw_exit(&svp->sv_lock);
1556                         (*extra)++;
1557                         continue;
1558                 }
1559 
1560                 if (ilp32) {
1561                         nblocks = &svn32->svn_nblocks;
1562                         mode = &svn32->svn_mode;
1563                         path = svn32->svn_path;
1564 
1565                         svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
1566                         svn32++;
1567                 } else {
1568                         nblocks = &svn->svn_nblocks;
1569                         mode = &svn->svn_mode;
1570                         path = svn->svn_path;
1571 
1572                         svn->svn_timestamp = svp->sv_timestamp;
1573                         svn++;
1574                 }
1575 
1576                 (void) strcpy(path, nsc_pathname(svp->sv_fd));
1577                 *nblocks = svp->sv_nblocks;
1578                 *mode = svp->sv_flag;
1579 
1580                 if (*nblocks == 0) {
1581                         if (sv_debug > 3)
1582                                 cmn_err(CE_CONT, "!sv_list: need to reserve\n");
1583 
1584                         if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
1585                                 *nblocks = svp->sv_nblocks;
1586                                 nsc_release(svp->sv_fd);
1587                         }
1588                 }
1589 
1590                 if (++index >= size) {
1591                         /* Out of space */
1592                         (*extra)++;
1593                 }
1594 
1595                 rw_exit(&svp->sv_lock);
1596         }
1597         mutex_exit(&sv_mutex);
1598 
1599         if (index < size) {
1600                 /* NULL terminated list */
1601                 if (ilp32)
1602                         svn32->svn_path[0] = '\0';
1603                 else
1604                         svn->svn_path[0] = '\0';
1605         }
1606 
1607         return (0);
1608 }
1609 
1610 
1611 static void
1612 sv_thread_tune(int threads)
1613 {
1614         int incr = (threads > 0) ? 1 : -1;
1615         int change = 0;
1616         int nthreads;
1617 
1618         ASSERT(MUTEX_HELD(&sv_mutex));
1619 
1620         if (sv_threads_extra) {
1621                 /* keep track of any additional threads requested */
1622                 if (threads > 0) {
1623                         sv_threads_extra += threads;
1624                         return;
1625                 }
1626                 threads = -threads;
1627                 if (threads >= sv_threads_extra) {
1628                         threads -= sv_threads_extra;
1629                         sv_threads_extra = 0;
1630                         /* fall through to while loop */
1631                 } else {
1632                         sv_threads_extra -= threads;
1633                         return;
1634                 }
1635         } else if (threads > 0) {
1636                 /*
1637                  * do not increase the number of threads beyond
1638                  * sv_threads_max when doing dynamic thread tuning
1639                  */
1640                 nthreads = nst_nthread(sv_tset);
1641                 if ((nthreads + threads) > sv_threads_max) {
1642                         sv_threads_extra = nthreads + threads - sv_threads_max;
1643                         threads = sv_threads_max - nthreads;
1644                         if (threads <= 0)
1645                                 return;
1646                 }
1647         }
1648 
1649         if (threads < 0)
1650                 threads = -threads;
1651 
1652         while (threads--) {
1653                 nthreads = nst_nthread(sv_tset);
1654                 sv_threads_needed += incr;
1655 
1656                 if (sv_threads_needed >= nthreads)
1657                         change += nst_add_thread(sv_tset, sv_threads_inc);
1658                 else if ((sv_threads_needed <
1659                     (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
1660                     ((nthreads - sv_threads_inc) >= sv_threads))
1661                         change -= nst_del_thread(sv_tset, sv_threads_inc);
1662         }
1663 
1664 #ifdef DEBUG
1665         if (change) {
1666                 cmn_err(CE_NOTE,
1667                     "!sv_thread_tune: threads needed %d, nthreads %d, "
1668                     "nthreads change %d",
1669                     sv_threads_needed, nst_nthread(sv_tset), change);
1670         }
1671 #endif
1672 }
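
/*
 * Worked example of the sizing arithmetic above (illustrative numbers
 * only): with sv_threads_inc = 8 and sv_threads_hysteresis = 4, if the
 * set currently has nthreads = 40, threads are added as soon as
 * sv_threads_needed reaches 40, but threads are only removed once
 * sv_threads_needed drops below 40 - (8 + 4) = 28.  The gap stops the
 * threadset from oscillating when the device count hovers around a
 * sizing boundary.
 */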
1673 
1674 
1675 /* ARGSUSED */
1676 static int
1677 svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
1678 {
1679         int rc;
1680 
1681         mutex_enter(&sv_mutex);
1682         rc = sv_init_devs();
1683         mutex_exit(&sv_mutex);
1684 
1685         return (rc);
1686 }
1687 
1688 
1689 /* ARGSUSED */
1690 static int
1691 svclose(dev_t dev, int flag, int otyp, cred_t *crp)
1692 {
1693         const int secs = HZ * 5;        /* five seconds, in ticks */
1694         const int ticks = HZ / 10;      /* polling interval, in ticks */
1695         int loops = secs / ticks;
1696 
1697         mutex_enter(&sv_mutex);
1698         while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
1699                 if (nst_nlive(sv_tset) <= 0) {
1700                         nst_destroy(sv_tset);
1701                         sv_tset = NULL;
1702                         break;
1703                 }
1704 
1705                 /* threads still active - wait for them to exit */
1706                 mutex_exit(&sv_mutex);
1707                 delay(ticks);
1708                 loops--;
1709                 mutex_enter(&sv_mutex);
1710         }
1711         mutex_exit(&sv_mutex);
1712 
1713         if (loops <= 0) {
1714                 cmn_err(CE_WARN,
1715 #ifndef DEBUG
1716                     /* do not write to console when non-DEBUG */
1717                     "!"
1718 #endif
1719                     "sv:svclose: threads still active "
1720                     "after %d sec - leaking thread set", secs);
1721         }
1722 
1723         return (0);
1724 }
1725 
1726 
1727 static int
1728 svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
1729 {
1730         char itmp1[12], itmp2[12]; /* temp char array for editing ints */
1731         spcs_s_info_t kstatus;  /* Kernel version of spcs status */
1732         spcs_s_info_t ustatus;  /* Address of user version of spcs status */
1733         sv_list32_t svl32;      /* 32 bit Initial structure for SVIOC_LIST */
1734         sv_version_t svv;       /* Version structure */
1735         sv_conf_t svc;          /* User config structure */
1736         sv_list_t svl;          /* Initial structure for SVIOC_LIST */
1737         void *usvn;             /* Address of user sv_name_t */
1738         void *svn = NULL;       /* Array for SVIOC_LIST */
1739         uint64_t phash;         /* pathname hash */
1740         int rc = 0;             /* Return code -- errno */
1741         int size;               /* Number of items in array */
1742         int bytes;              /* Byte size of array */
1743         int ilp32;              /* Convert data structures for ilp32 userland */
1744 
1745         *rvalp = 0;
1746 
1747         /*
1748          * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, continue as normal.
1749          * Otherwise it was previously SV_PREVENT_UNLOAD and is now
1750          * SV_ALLOW_UNLOAD, so the driver is expected to unload shortly.
1751          *
1752          * SV_ALLOW_UNLOAD is a final state, so no need to grab sv_mutex.
1753          */
1754         if (sv_mod_status == SV_ALLOW_UNLOAD) {
1755                 return (EBUSY);
1756         }
1757 
1758         if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
1759                 return (rc);
1760 
1761         kstatus = spcs_s_kcreate();
1762         if (!kstatus) {
1763                 DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
1764                 return (ENOMEM);
1765         }
1766 
1767         ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
1768 
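        /*
         * Each command below follows the same data-model discipline: for
         * ILP32 userland the 32-bit structure (sv_conf32_t, sv_list32_t,
         * sv_version32_t) is copied in and out and its fields moved to
         * and from the native structure by hand; otherwise the native
         * structure is copied directly.
         */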
1769         switch (cmd) {
1770 
1771         case SVIOC_ENABLE:
1772 
1773                 if (ilp32) {
1774                         sv_conf32_t svc32;
1775 
1776                         if (ddi_copyin((void *)arg, &svc32,
1777                             sizeof (svc32), mode) < 0) {
1778                                 spcs_s_kfree(kstatus);
1779                                 return (EFAULT);
1780                         }
1781 
1782                         svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1783                         (void) strcpy(svc.svc_path, svc32.svc_path);
1784                         svc.svc_flag  = svc32.svc_flag;
1785                         svc.svc_major = svc32.svc_major;
1786                         svc.svc_minor = svc32.svc_minor;
1787                 } else {
1788                         if (ddi_copyin((void *)arg, &svc,
1789                             sizeof (svc), mode) < 0) {
1790                                 spcs_s_kfree(kstatus);
1791                                 return (EFAULT);
1792                         }
1793                 }
1794 
1795                 /* force to raw access */
1796                 svc.svc_flag = NSC_DEVICE;
1797 
1798                 if (sv_tset == NULL) {
1799                         mutex_enter(&sv_mutex);
1800 
1801                         if (sv_tset == NULL) {
1802                                 sv_tset = nst_init("sv_thr", sv_threads);
1803                         }
1804 
1805                         mutex_exit(&sv_mutex);
1806 
1807                         if (sv_tset == NULL) {
1808                                 cmn_err(CE_WARN,
1809                                     "!sv: could not allocate %d threads",
1810                                     sv_threads);
1811                         }
1812                 }
1813 
1814                 rc = sv_enable(svc.svc_path, svc.svc_flag,
1815                     makedevice(svc.svc_major, svc.svc_minor), kstatus);
1816 
1817                 if (rc == 0) {
1818                         sv_config_time = nsc_lbolt();
1819 
1820                         mutex_enter(&sv_mutex);
1821                         sv_thread_tune(sv_threads_dev);
1822                         mutex_exit(&sv_mutex);
1823                 }
1824 
1825                 DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);
1826 
1827                 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1828                 /* NOTREACHED */
1829 
1830         case SVIOC_DISABLE:
1831 
1832                 if (ilp32) {
1833                         sv_conf32_t svc32;
1834 
1835                         if (ddi_copyin((void *)arg, &svc32,
1836                             sizeof (svc32), mode) < 0) {
1837                                 spcs_s_kfree(kstatus);
1838                                 return (EFAULT);
1839                         }
1840 
1841                         svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1842                         svc.svc_major = svc32.svc_major;
1843                         svc.svc_minor = svc32.svc_minor;
1844                         (void) strcpy(svc.svc_path, svc32.svc_path);
1845                         svc.svc_flag  = svc32.svc_flag;
1846                 } else {
1847                         if (ddi_copyin((void *)arg, &svc,
1848                             sizeof (svc), mode) < 0) {
1849                                 spcs_s_kfree(kstatus);
1850                                 return (EFAULT);
1851                         }
1852                 }
1853 
1854                 if (svc.svc_major == (major_t)-1 &&
1855                     svc.svc_minor == (minor_t)-1) {
1856                         sv_dev_t *svp;
1857                         int i;
1858 
1859                         /*
1860                          * User level could not find the minor device
1861                          * node, so do this the slow way by searching
1862                          * the entire sv config for a matching pathname.
1863                          */
1864 
1865                         phash = nsc_strhash(svc.svc_path);
1866 
1867                         mutex_enter(&sv_mutex);
1868 
1869                         for (i = 0; i < sv_max_devices; i++) {
1870                                 svp = &sv_devs[i];
1871 
1872                                 if (svp->sv_state == SV_DISABLE ||
1873                                     svp->sv_fd == NULL)
1874                                         continue;
1875 
1876                                 if (nsc_fdpathcmp(svp->sv_fd, phash,
1877                                     svc.svc_path) == 0) {
1878                                         svc.svc_major = getmajor(svp->sv_dev);
1879                                         svc.svc_minor = getminor(svp->sv_dev);
1880                                         break;
1881                                 }
1882                         }
1883 
1884                         mutex_exit(&sv_mutex);
1885 
1886                         if (svc.svc_major == (major_t)-1 &&
1887                             svc.svc_minor == (minor_t)-1)
1888                                 return (spcs_s_ocopyoutf(&kstatus,
1889                                     svc.svc_error, SV_ENODEV));
1890                 }
1891 
1892                 rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
1893                     kstatus);
1894 
1895                 if (rc == 0) {
1896                         sv_config_time = nsc_lbolt();
1897 
1898                         mutex_enter(&sv_mutex);
1899                         sv_thread_tune(-sv_threads_dev);
1900                         mutex_exit(&sv_mutex);
1901                 }
1902 
1903                 DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);
1904 
1905                 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1906                 /* NOTREACHED */
1907 
1908         case SVIOC_LIST:
1909 
1910                 if (ilp32) {
1911                         if (ddi_copyin((void *)arg, &svl32,
1912                             sizeof (svl32), mode) < 0) {
1913                                 spcs_s_kfree(kstatus);
1914                                 return (EFAULT);
1915                         }
1916 
1917                         ustatus = (spcs_s_info_t)svl32.svl_error;
1918                         size = svl32.svl_count;
1919                         usvn = (void *)(unsigned long)svl32.svl_names;
1920                 } else {
1921                         if (ddi_copyin((void *)arg, &svl,
1922                             sizeof (svl), mode) < 0) {
1923                                 spcs_s_kfree(kstatus);
1924                                 return (EFAULT);
1925                         }
1926 
1927                         ustatus = svl.svl_error;
1928                         size = svl.svl_count;
1929                         usvn = svl.svl_names;
1930                 }
1931 
1932                 /* Do some boundary checking */
1933                 if ((size < 0) || (size > sv_max_devices)) {
1934                         /* Array size is out of range */
1935                         return (spcs_s_ocopyoutf(&kstatus, ustatus,
1936                             SV_EARRBOUNDS, "0",
1937                             spcs_s_inttostring(sv_max_devices, itmp1,
1938                             sizeof (itmp1), 0),
1939                             spcs_s_inttostring(size, itmp2,
1940                             sizeof (itmp2), 0)));
1941                 }
1942 
1943                 if (ilp32)
1944                         bytes = size * sizeof (sv_name32_t);
1945                 else
1946                         bytes = size * sizeof (sv_name_t);
1947 
1948                 /* Allocate memory for the array of structures */
1949                 if (bytes != 0) {
1950                         /* KM_SLEEP allocations cannot fail */
1951                         svn = kmem_zalloc(bytes, KM_SLEEP);
1955                 }
1956 
1957                 rc = sv_list(svn, size, rvalp, ilp32);
1958                 if (rc) {
1959                         if (svn != NULL)
1960                                 kmem_free(svn, bytes);
1961                         return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
1962                 }
1963 
1964                 if (ilp32) {
1965                         svl32.svl_timestamp = (uint32_t)sv_config_time;
1966                         svl32.svl_maxdevs = (int32_t)sv_max_devices;
1967 
1968                         /* Return the list structure */
1969                         if (ddi_copyout(&svl32, (void *)arg,
1970                             sizeof (svl32), mode) < 0) {
1971                                 spcs_s_kfree(kstatus);
1972                                 if (svn != NULL)
1973                                         kmem_free(svn, bytes);
1974                                 return (EFAULT);
1975                         }
1976                 } else {
1977                         svl.svl_timestamp = sv_config_time;
1978                         svl.svl_maxdevs = sv_max_devices;
1979 
1980                         /* Return the list structure */
1981                         if (ddi_copyout(&svl, (void *)arg,
1982                             sizeof (svl), mode) < 0) {
1983                                 spcs_s_kfree(kstatus);
1984                                 if (svn != NULL)
1985                                         kmem_free(svn, bytes);
1986                                 return (EFAULT);
1987                         }
1988                 }
1989 
1990                 /* Return the array */
1991                 if (svn != NULL) {
1992                         if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
1993                                 kmem_free(svn, bytes);
1994                                 spcs_s_kfree(kstatus);
1995                                 return (EFAULT);
1996                         }
1997                         kmem_free(svn, bytes);
1998                 }
1999 
2000                 DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);
2001 
2002                 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2003                 /* NOTREACHED */
2004 
2005         case SVIOC_VERSION:
2006 
2007                 if (ilp32) {
2008                         sv_version32_t svv32;
2009 
2010                         if (ddi_copyin((void *)arg, &svv32,
2011                             sizeof (svv32), mode) < 0) {
2012                                 spcs_s_kfree(kstatus);
2013                                 return (EFAULT);
2014                         }
2015 
2016                         svv32.svv_major_rev = sv_major_rev;
2017                         svv32.svv_minor_rev = sv_minor_rev;
2018                         svv32.svv_micro_rev = sv_micro_rev;
2019                         svv32.svv_baseline_rev = sv_baseline_rev;
2020 
2021                         if (ddi_copyout(&svv32, (void *)arg,
2022                             sizeof (svv32), mode) < 0) {
2023                                 spcs_s_kfree(kstatus);
2024                                 return (EFAULT);
2025                         }
2026 
2027                         ustatus = (spcs_s_info_t)svv32.svv_error;
2028                 } else {
2029                         if (ddi_copyin((void *)arg, &svv,
2030                             sizeof (svv), mode) < 0) {
2031                                 spcs_s_kfree(kstatus);
2032                                 return (EFAULT);
2033                         }
2034 
2035                         svv.svv_major_rev = sv_major_rev;
2036                         svv.svv_minor_rev = sv_minor_rev;
2037                         svv.svv_micro_rev = sv_micro_rev;
2038                         svv.svv_baseline_rev = sv_baseline_rev;
2039 
2040                         if (ddi_copyout(&svv, (void *)arg,
2041                             sizeof (svv), mode) < 0) {
2042                                 spcs_s_kfree(kstatus);
2043                                 return (EFAULT);
2044                         }
2045 
2046                         ustatus = svv.svv_error;
2047                 }
2048 
2049                 DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);
2050 
2051                 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2052                 /* NOTREACHED */
2053 
2054         case SVIOC_UNLOAD:
2055                 rc = sv_prepare_unload();
2056 
2057                 if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
2058                         rc = EFAULT;
2059                 }
2060 
2061                 spcs_s_kfree(kstatus);
2062                 return (rc);
2063 
2064         default:
2065                 spcs_s_kfree(kstatus);
2066 
2067                 DTRACE_PROBE3(sv_ioctl_5, dev_t, dev, int, *rvalp, int, EINVAL);
2068 
2069                 return (EINVAL);
2070                 /* NOTREACHED */
2071         }
2072 
2073         /* NOTREACHED */
2074 }
2075 
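/*
 * Illustrative userland sketch (not part of this driver): querying the
 * driver revision with the SVIOC_VERSION case above.  Assumes fd is an
 * open descriptor for the sv control node and that spcs_s_ucreate()
 * supplies the user-side status handle; both are assumptions here.
 *
 *	sv_version_t svv;
 *
 *	svv.svv_error = spcs_s_ucreate();
 *	if (ioctl(fd, SVIOC_VERSION, &svv) == 0) {
 *		(void) printf("sv %d.%d.%d (baseline %d)\n",
 *		    svv.svv_major_rev, svv.svv_minor_rev,
 *		    svv.svv_micro_rev, svv.svv_baseline_rev);
 *	}
 */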
2076 
2077 /* ARGSUSED */
2078 static int
2079 svprint(dev_t dev, char *str)
2080 {
2081         int instance = ddi_get_instance(sv_dip);
2082         cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
2083         return (0);
2084 }
2085 
2086 
2087 static void
2088 _sv_lyr_strategy(struct buf *bp)
2089 {
2090         caddr_t buf_addr;               /* pointer to linear buffer in bp */
2091         nsc_buf_t *bufh = NULL;
2092         nsc_buf_t *hndl = NULL;
2093         sv_dev_t *svp;
2094         nsc_vec_t *v;
2095         sv_maj_t *maj;
2096         nsc_size_t fba_req, fba_len;    /* FBA lengths */
2097         nsc_off_t fba_off;              /* FBA offset */
2098         size_t tocopy, nbytes;          /* byte lengths */
2099         int rw, rc;                     /* flags and return codes */
2100         int (*fn)();
2101 
2102         rc = 0;
2103 
2104         if (sv_debug > 5)
2105                 cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);
2106 
2107         svp = sv_find_enabled(bp->b_edev, &maj);
2108         if (svp == NULL) {
2109                 if (maj && (fn = maj->sm_strategy) != 0) {
2110                         if (!(maj->sm_flag & D_MP)) {
2111                                 UNSAFE_ENTER();
2112                                 rc = (*fn)(bp);
2113                                 UNSAFE_EXIT();
2114                         } else {
2115                                 rc = (*fn)(bp);
2116                         }
2117                         return;
2118                 } else {
2119                         bioerror(bp, ENODEV);
2120                         biodone(bp);
2121                         return;
2122                 }
2123         }
2124 
2125         ASSERT(RW_READ_HELD(&svp->sv_lock));
2126 
2127         if (svp->sv_flag == 0) {
2128                 /*
2129                  * Guard access mode
2130                  * - prevent user-level access to the device.
2131                  */
2132                 DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
2133                 bioerror(bp, EPERM);
2134                 goto out;
2135         }
2136 
2137         if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
2138                 DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);
2139 
2140                 if (rc == EINTR)
2141                         cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
2142                 bioerror(bp, rc);
2143                 goto out;
2144         }
2145 
2146         if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
2147                 DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);
2148 
2149                 if (bp->b_flags & B_READ) {
2150                         /* return EOF, not an error */
2151                         bp->b_resid = bp->b_bcount;
2152                         bioerror(bp, 0);
2153                 } else
2154                         bioerror(bp, EINVAL);
2155 
2156                 goto done;
2157         }
2158 
2159         /*
2160          * Preallocate a handle once per call to strategy.
2161          * If this fails, then nsc_alloc_buf() will allocate
2162          * a temporary handle per allocation/free pair.
2163          */
2164 
2165         DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);
2166 
2167         bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);
2168 
2169         DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);
2170 
2171         if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
2172                 DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);
2173 
2174                 cmn_err(CE_WARN,
2175                     "!sv: allocated active handle (bufh %p, flags %x)",
2176                     (void *)bufh, bufh->sb_flag);
2177 
2178                 bioerror(bp, ENXIO);
2179                 goto done;
2180         }
2181 
2182         fba_req = FBA_LEN(bp->b_bcount);
2183         if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
2184                 fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);
2185 
2186         rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;
2187 
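        /*
         * Map the caller's data into kernel virtual memory so that the
         * bcopy() loop below can address it through buf_addr.
         */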
2188         bp_mapin(bp);
2189 
2190         bp->b_resid = bp->b_bcount;
2191         buf_addr = bp->b_un.b_addr;
2192         fba_off = 0;
2193 
2194         /*
2195          * fba_req  - requested size of transfer in FBAs after
2196          *              truncation to device extent, and allowing for
2197          *              possible non-FBA bounded final chunk.
2198          * fba_off  - offset of start of chunk from start of bp in FBAs.
2199          * fba_len  - size of this chunk in FBAs.
2200          */
2201 
2202 loop:
2203         fba_len = min(fba_req, svp->sv_maxfbas);
2204         hndl = bufh;
2205 
2206         DTRACE_PROBE4(sv_dbg_allocb_start,
2207             sv_dev_t *, svp,
2208             uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
2209             uint64_t, (uint64_t)fba_len,
2210             int, rw);
2211 
2212         rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
2213             fba_len, rw, &hndl);
2214 
2215         DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);
2216 
2217         if (rc > 0) {
2218                 DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
2219                 bioerror(bp, rc);
2220                 if (hndl != bufh)
2221                         (void) nsc_free_buf(hndl);
2222                 hndl = NULL;
2223                 goto done;
2224         }
2225 
2226         tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
2227         v = hndl->sb_vec;
2228 
2229         if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
2230                 /*
2231                  * Not overwriting all of the last FBA, so read in the
2232                  * old contents now before we overwrite it with the new
2233                  * data.
2234                  */
2235 
2236                 DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
2237                     uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));
2238 
2239                 rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
2240                 if (rc > 0) {
2241                         bioerror(bp, rc);
2242                         goto done;
2243                 }
2244 
2245                 DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
2246         }
2247 
2248         DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);
2249 
2250         while (tocopy > 0) {
2251                 nbytes = min(tocopy, (nsc_size_t)v->sv_len);
2252 
2253                 if (bp->b_flags & B_READ)
2254                         (void) bcopy(v->sv_addr, buf_addr, nbytes);
2255                 else
2256                         (void) bcopy(buf_addr, v->sv_addr, nbytes);
2257 
2258                 bp->b_resid -= nbytes;
2259                 buf_addr += nbytes;
2260                 tocopy -= nbytes;
2261                 v++;
2262         }
2263 
2264         DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);
2265 
2266         if ((bp->b_flags & B_READ) == 0) {
2267                 DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
2268                     uint64_t, (uint64_t)hndl->sb_pos,
2269                     uint64_t, (uint64_t)hndl->sb_len);
2270 
2271                 rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);
2272 
2273                 DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);
2274 
2275                 if (rc > 0) {
2276                         bioerror(bp, rc);
2277                         goto done;
2278                 }
2279         }
2280 
2281         /*
2282          * Adjust the FBA offset and the requested (i.e. remaining)
2283          * length, and loop if there is more data to transfer.
2284          */
2285 
2286         fba_off += fba_len;
2287         fba_req -= fba_len;
2288 
2289         if (fba_req > 0) {
2290                 DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2291 
2292                 rc = nsc_free_buf(hndl);
2293 
2294                 DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2295 
2296                 if (rc > 0) {
2297                         DTRACE_PROBE1(sv_lyr_strategy_err_free,
2298                             struct buf *, bp);
2299                         bioerror(bp, rc);
2300                 }
2301 
2302                 hndl = NULL;
2303 
2304                 if (rc <= 0)
2305                         goto loop;
2306         }
2307 
2308 done:
2309         if (hndl != NULL) {
2310                 DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2311 
2312                 rc = nsc_free_buf(hndl);
2313 
2314                 DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2315 
2316                 if (rc > 0) {
2317                         DTRACE_PROBE1(sv_lyr_strategy_err_free,
2318                             struct buf *, bp);
2319                         bioerror(bp, rc);
2320                 }
2321 
2322                 hndl = NULL;
2323         }
2324 
2325         if (bufh)
2326                 (void) nsc_free_handle(bufh);
2327 
2328         DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);
2329 
2330         nsc_release(svp->sv_fd);
2331 
2332         DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);
2333 
2334 out:
2335         if (sv_debug > 5) {
2336                 cmn_err(CE_CONT,
2337                     "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
2338                     (void *)bp, (void *)bufh, bp->b_error);
2339         }
2340 
2341         DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);
2342 
2343         rw_exit(&svp->sv_lock);
2344         biodone(bp);
2345 }
2346 
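/*
 * Stripped of the I/O calls, the chunking arithmetic in the loop above
 * is simply (an illustrative restatement, not compiled):
 *
 *	fba_off = 0;
 *	while (fba_req > 0) {
 *		fba_len = min(fba_req, svp->sv_maxfbas);
 *		- transfer fba_len FBAs at (bp->b_lblkno + fba_off)
 *		fba_off += fba_len;
 *		fba_req -= fba_len;
 *	}
 *
 * so each pass moves at most sv_maxfbas FBAs, the per-transfer limit
 * recorded for the underlying nsctl device.
 */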
2347 
2348 static void
2349 sv_async_strategy(blind_t arg)
2350 {
2351         struct buf *bp = (struct buf *)arg;
2352         _sv_lyr_strategy(bp);
2353 }
2354 
2355 
2356 static int
2357 sv_lyr_strategy(struct buf *bp)
2358 {
2359         nsthread_t *tp;
2360         int nlive;
2361 
2362         /*
2363          * If B_ASYNC were part of the DDI, we could use it as a hint
2364          * not to create a thread for synchronous i/o.
2365          */
2366         if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
2367                 /* not sv enabled - just pass through */
2368                 DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
2369                 _sv_lyr_strategy(bp);
2370                 return (0);
2371         }
2372 
2373         if (sv_debug > 4) {
2374                 cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
2375                     nst_nthread(sv_tset), nst_nlive(sv_tset));
2376         }
2377 
2378         /*
2379          * If only guard devices are enabled there won't be a
2380          * threadset, so don't try to use it.
2381          */
2382         tp = NULL;
2383         if (sv_tset != NULL) {
2384                 tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
2385         }
2386 
2387         if (tp == NULL) {
2388                 /*
2389                  * Out of threads, so fall back to synchronous i/o.
2390                  */
2391                 if (sv_debug > 0) {
2392                         cmn_err(CE_CONT,
2393                             "!sv_lyr_strategy: thread alloc failed\n");
2394                 }
2395 
2396                 DTRACE_PROBE1(sv_lyr_strategy_no_thread,
2397                     struct buf *, bp);
2398 
2399                 _sv_lyr_strategy(bp);
2400                 sv_no_threads++;
2401         } else {
2402                 nlive = nst_nlive(sv_tset);
2403                 if (nlive > sv_max_nlive) {
2404                         if (sv_debug > 0) {
2405                                 cmn_err(CE_CONT,
2406                                     "!sv_lyr_strategy: "
2407                                     "new max nlive %d (nthread %d)\n",
2408                                     nlive, nst_nthread(sv_tset));
2409                         }
2410 
2411                         sv_max_nlive = nlive;
2412                 }
2413         }
2414 
2415         return (0);
2416 }
2417 
2418 /*
2419  * re-write the size of the current partition
2420  */
2421 static int
2422 sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
2423 {
2424         size_t offset;
2425         int ilp32;
2426         int pnum;
2427         int rc;
2428 
2429         ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
2430 
2431         rc = nskern_partition(svp->sv_dev, &pnum);
2432         if (rc != 0) {
2433                 return (rc);
2434         }
2435 
2436         if (pnum < 0 || pnum >= V_NUMPAR) {
2437                 cmn_err(CE_WARN,
2438                     "!sv_gvtoc: unable to determine partition number "
2439                     "for dev %lx", svp->sv_dev);
2440                 return (EINVAL);
2441         }
2442 
2443         if (ilp32) {
2444                 int32_t p_size;
2445 
2446 #ifdef _SunOS_5_6
2447                 offset = offsetof(struct vtoc, v_part);
2448                 offset += sizeof (struct partition) * pnum;
2449                 offset += offsetof(struct partition, p_size);
2450 #else
2451                 offset = offsetof(struct vtoc32, v_part);
2452                 offset += sizeof (struct partition32) * pnum;
2453                 offset += offsetof(struct partition32, p_size);
2454 #endif
2455 
2456                 p_size = (int32_t)svp->sv_nblocks;
2457                 if (p_size == 0) {
2458                         if (sv_reserve(svp->sv_fd,
2459                             NSC_MULTI|NSC_PCATCH) == 0) {
2460                                 p_size = (int32_t)svp->sv_nblocks;
2461                                 nsc_release(svp->sv_fd);
2462                         } else {
2463                                 rc = EINTR;
2464                         }
2465                 }
2466 
2467                 if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2468                     sizeof (p_size), mode) != 0) {
2469                         rc = EFAULT;
2470                 }
2471         } else {
2472                 long p_size;
2473 
2474                 offset = offsetof(struct vtoc, v_part);
2475                 offset += sizeof (struct partition) * pnum;
2476                 offset += offsetof(struct partition, p_size);
2477 
2478                 p_size = (long)svp->sv_nblocks;
2479                 if (p_size == 0) {
2480                         if (sv_reserve(svp->sv_fd,
2481                             NSC_MULTI|NSC_PCATCH) == 0) {
2482                                 p_size = (long)svp->sv_nblocks;
2483                                 nsc_release(svp->sv_fd);
2484                         } else {
2485                                 rc = EINTR;
2486                         }
2487                 }
2488 
2489                 if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2490                     sizeof (p_size), mode) != 0) {
2491                         rc = EFAULT;
2492                 }
2493         }
2494 
2495         return (rc);
2496 }
2497 
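/*
 * The single-field ddi_copyout() technique above patches one member of
 * a structure that already sits in user memory: the byte offset of
 * v_part[pnum].p_size inside the user's struct vtoc is
 *
 *	offset = offsetof(struct vtoc, v_part) +
 *	    pnum * sizeof (struct partition) +
 *	    offsetof(struct partition, p_size);
 *
 * and only sizeof (p_size) bytes at (arg + offset) are rewritten, so
 * every other field that the underlying driver returned is preserved.
 */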
2498 
2499 #ifdef DKIOCPARTITION
2500 /*
2501  * re-write the size of the current partition
2502  *
2503  * arg is dk_efi_t.
2504  *
2505  * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
2506  *
2507  * dk_efi_t->dki_data --> efi_gpt_t (label header)
2508  * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
2509  *
2510  * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
2511  * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
2512  *
2513  * This assumes that sizeof (efi_gpt_t) is the same as the size of a
2514  * logical block on the disk.
2515  *
2516  * Everything is little endian (i.e. disk format).
2517  */
2518 static int
2519 sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
2520 {
2521         dk_efi_t efi;
2522         efi_gpt_t gpt;
2523         efi_gpe_t *gpe = NULL;
2524         size_t sgpe;
2525         uint64_t p_size;        /* virtual partition size from nsctl */
2526         uint32_t crc;
2527         int unparts;            /* number of parts in user's array */
2528         int pnum;
2529         int rc;
2530 
2531         rc = nskern_partition(svp->sv_dev, &pnum);
2532         if (rc != 0) {
2533                 return (rc);
2534         }
2535 
2536         if (pnum < 0) {
2537                 cmn_err(CE_WARN,
2538                     "!sv_efi: unable to determine partition number for dev %lx",
2539                     svp->sv_dev);
2540                 return (EINVAL);
2541         }
2542 
2543         if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
2544                 return (EFAULT);
2545         }
2546 
2547         efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
2548 
2549         if (efi.dki_length < sizeof (gpt) + sizeof (*gpe)) {
2550                 return (EINVAL);
2551         }
2552 
2553         if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
2554                 rc = EFAULT;
2555                 goto out;
2556         }
2557 
2558         if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
2559                 unparts = 1;
2560         else if (pnum >= unparts) {
2561                 cmn_err(CE_WARN,
2562                     "!sv_efi: partition# beyond end of user array (%d >= %d)",
2563                     pnum, unparts);
2564                 return (EINVAL);
2565         }
2566 
2567         sgpe = sizeof (*gpe) * unparts;
2568         gpe = kmem_alloc(sgpe, KM_SLEEP);
2569 
2570         if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
2571                 rc = EFAULT;
2572                 goto out;
2573         }
2574 
2575         p_size = svp->sv_nblocks;
2576         if (p_size == 0) {
2577                 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2578                         p_size = (diskaddr_t)svp->sv_nblocks;
2579                         nsc_release(svp->sv_fd);
2580                 } else {
2581                         rc = EINTR;
2582                 }
2583         }
2584 
2585         gpe[pnum].efi_gpe_EndingLBA = LE_64(
2586             LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);
2587 
2588         gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
2589         CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
2590         gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2591 
2592         gpt.efi_gpt_HeaderCRC32 = 0;
2593         CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
2594         gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2595 
2596         if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
2597                 rc = EFAULT;
2598                 goto out;
2599         }
2600 
2601         if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
2602                 rc = EFAULT;
2603                 goto out;
2604         }
2605 
2606 out:
2607         if (gpe) {
2608                 kmem_free(gpe, sgpe);
2609         }
2610 
2611         return (rc);
2612 }
2613 
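/*
 * Note the CRC ordering above: the partition-array CRC must be stored
 * into the header before the header checksum is taken (the header sum
 * covers that field), the header's own CRC field is zeroed before the
 * header is summed, and both results are stored bit-inverted (~crc)
 * and little-endian to match the EFI on-disk format.
 */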
2614 
2615 /*
2616  * Re-write the size of the partition specified by p_partno
2617  *
2618  * Note that if a DKIOCPARTITION is issued to an fd opened against a
2619  * non-sv'd device, but p_partno requests the size for a different
2620  * device that is sv'd, this function will *not* be called as sv is
2621  * not interposed on the original device (the fd).
2622  *
2623  * It would not be easy to change this as we cannot get the partition
2624  * number for the non-sv'd device, so cannot compute the dev_t of the
2625  * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
2626  * its size from nsctl.
2627  *
2628  * See also the "Bug 4755783" comment in sv_lyr_ioctl().
2629  */
2630 static int
2631 sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
2632 {
2633         struct partition64 p64;
2634         sv_dev_t *nsvp = NULL;
2635         diskaddr_t p_size;
2636         minor_t nminor;
2637         int pnum, rc;
2638         dev_t ndev;
2639 
2640         rc = nskern_partition(svp->sv_dev, &pnum);
2641         if (rc != 0) {
2642                 return (rc);
2643         }
2644 
2645         if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
2646                 return (EFAULT);
2647         }
2648 
2649         if (p64.p_partno != pnum) {
2650                 /* switch to requested partition, not the current one */
2651                 nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
2652                 ndev = makedevice(getmajor(svp->sv_dev), nminor);
2653                 nsvp = sv_find_enabled(ndev, NULL);
2654                 if (nsvp == NULL) {
2655                         /* not sv device - just return */
2656                         return (0);
2657                 }
2658 
2659                 svp = nsvp;
2660         }
2661 
2662         p_size = svp->sv_nblocks;
2663         if (p_size == 0) {
2664                 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2665                         p_size = (diskaddr_t)svp->sv_nblocks;
2666                         nsc_release(svp->sv_fd);
2667                 } else {
2668                         rc = EINTR;
2669                 }
2670         }
2671 
2672         if (nsvp != NULL) {
2673                 rw_exit(&nsvp->sv_lock);
2674         }
2675 
2676         if ((rc == 0) && ddi_copyout(&p_size,
2677             (void *)(arg + offsetof(struct partition64, p_size)),
2678             sizeof (p_size), mode) != 0) {
2679                 return (EFAULT);
2680         }
2681 
2682         return (rc);
2683 }
2684 #endif /* DKIOCPARTITION */
2685 
2686 
2687 static int
2688 sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
2689     const int mode, cred_t *crp, int *rvalp)
2690 {
2691         sv_dev_t *svp;
2692         sv_maj_t *maj;
2693         int (*fn)();
2694         int rc = 0;
2695 
2696         maj = NULL;
2697         fn = NULL;
2698 
2699         /*
2700          * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, continue as normal.
2701          * Otherwise it was previously SV_PREVENT_UNLOAD and is now
2702          * SV_ALLOW_UNLOAD, so the driver is expected to unload shortly.
2703          *
2704          * SV_ALLOW_UNLOAD is a final state, so no need to grab sv_mutex.
2705          */
2706         if (sv_mod_status == SV_ALLOW_UNLOAD) {
2707                 return (EBUSY);
2708         }
2709 
2710         svp = sv_find_enabled(dev, &maj);
2711         if (svp != NULL) {
2712                 if (nskernd_isdaemon()) {
2713                         /*
2714                          * This is nskernd which always needs to see
2715                          * the underlying disk device accurately.
2716                          *
2717                          * So just pass the ioctl straight through
2718                          * to the underlying driver as though the device
2719                          * was not sv enabled.
2720                          */
2721                         DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
2722                             dev_t, dev);
2723 
2724                         rw_exit(&svp->sv_lock);
2725                         svp = NULL;
2726                 } else {
2727                         ASSERT(RW_READ_HELD(&svp->sv_lock));
2728                 }
2729         }
2730 
2731         /*
2732          * We now have a locked and enabled SV device, or a non-SV device.
2733          */
2734 
2735         switch (cmd) {
2736                 /*
2737                  * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
2738                  * and DKIOCSETEFI are intercepted and faked up because
2739                  * some i/o providers emulate volumes of a different size
2740                  * from the underlying volume.
2741                  *
2742                  * Setting the size by rewriting the vtoc is not permitted.
2743                  */
2744 
2745         case DKIOCSVTOC:
2746 #ifdef DKIOCPARTITION
2747         case DKIOCSETEFI:
2748 #endif
2749                 if (svp == NULL) {
2750                         /* not intercepted -- allow ioctl through */
2751                         break;
2752                 }
2753 
2754                 rw_exit(&svp->sv_lock);
2755 
2756                 DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);
2757 
2758                 return (EPERM);
2759 
2760         default:
2761                 break;
2762         }
2763 
2764         /*
2765          * Pass through the real ioctl command.
2766          */
2767 
2768         if (maj && (fn = maj->sm_ioctl) != 0) {
2769                 if (!(maj->sm_flag & D_MP)) {
2770                         UNSAFE_ENTER();
2771                         rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2772                         UNSAFE_EXIT();
2773                 } else {
2774                         rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2775                 }
2776         } else {
2777                 rc = ENODEV;
2778         }
2779 
2780         /*
2781          * Bug 4755783
2782          * Fix up the size of the current partition to allow
2783          * for the virtual volume to be a different size from the
2784          * physical volume (e.g. for II compact dependent shadows).
2785          *
2786          * Note that this only attempts to fix up the current partition
2787          * - the one that the ioctl was issued against.  There could be
2788          * other sv'd partitions in the same vtoc, but we cannot tell,
2789          * so we don't attempt to fix them up.
2790          */
2791 
2792         if (svp != NULL && rc == 0) {
2793                 switch (cmd) {
2794                 case DKIOCGVTOC:
2795                         rc = sv_fix_dkiocgvtoc(arg, mode, svp);
2796                         break;
2797 
2798 #ifdef DKIOCPARTITION
2799                 case DKIOCGETEFI:
2800                         rc = sv_fix_dkiocgetefi(arg, mode, svp);
2801                         break;
2802 
2803                 case DKIOCPARTITION:
2804                         rc = sv_fix_dkiocpartition(arg, mode, svp);
2805                         break;
2806 #endif /* DKIOCPARTITION */
2807                 }
2808         }
2809 
2810         if (svp != NULL) {
2811                 rw_exit(&svp->sv_lock);
2812         }
2813 
2814         return (rc);
2815 }