/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * Storage Volume Character and Block Driver (SV)
 *
 * This driver implements a simplistic /dev/{r}dsk/ interface to a
 * specified disk volume that is otherwise managed by the Prism
 * software.  The SV driver layers itself onto the underlying disk
 * device driver by changing function pointers in the cb_ops
 * structure.
 *
 * CONFIGURATION:
 *
 * 1. Configure the driver using the svadm utility.
 * 2. Access the device as before through /dev/rdsk/c?t?d?s?
 *
 * LIMITATIONS:
 *
 * This driver should NOT be used to share a device between another
 * DataServices user interface module (e.g., STE) and a user accessing
 * the device through the block device in O_WRITE mode.  This is because
 * writes through the block device are asynchronous (due to the page
 * cache) and so consistency between the block device user and the
 * STE user cannot be guaranteed.
 *
 * Data is copied between system struct buf(9s) and nsc_vec_t.  This is
 * wasteful and slow.
 */
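
/*
 * Editorial sketch (illustration only) of the cb_ops interposition
 * described above; the real work is done in sv_enable() and undone in
 * sv_free() below.  The underlying driver's entry points are saved in
 * per-major state (sv_maj_t) and replaced with sv's own, so that all
 * I/O passes through sv first and is forwarded via the saved pointers:
 *
 *	cb_ops = maj->sm_dev_ops->devo_cb_ops;
 *	maj->sm_strategy = cb_ops->cb_strategy;	   (save the original)
 *	cb_ops->cb_strategy = sv_lyr_strategy;	   (interpose sv's own)
 *	...
 *	(*maj->sm_strategy)(bp);		   (forward to the driver)
 */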

#include <sys/debug.h>
#include <sys/types.h>

#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/varargs.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/uio.h>
#ifndef DS_DDICT
#include <sys/pathname.h>
#endif
#include <sys/aio_req.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/nsctl/nsvers.h>

#include <sys/nsc_thread.h>
#include <sys/unistat/spcs_s.h>
#include <sys/unistat/spcs_s_k.h>
#include <sys/unistat/spcs_errors.h>

#ifdef DS_DDICT
#include "../contract.h"
#endif

#include "../nsctl.h"


#include <sys/sdt.h>              /* dtrace is S10 or later */

#include "sv.h"
#include "sv_impl.h"
#include "sv_efi.h"

#define MAX_EINTR_COUNT 1000

/*
 * sv_mod_status
 */
#define SV_PREVENT_UNLOAD 1
#define SV_ALLOW_UNLOAD 2

static const int sv_major_rev = ISS_VERSION_MAJ;        /* Major number */
static const int sv_minor_rev = ISS_VERSION_MIN;        /* Minor number */
static const int sv_micro_rev = ISS_VERSION_MIC;        /* Micro number */
static const int sv_baseline_rev = ISS_VERSION_NUM;     /* Baseline number */

#ifdef DKIOCPARTITION
/*
 * CRC32 polynomial table needed for computing the checksums
 * in an EFI vtoc.
 */
static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
#endif

static clock_t sv_config_time;          /* Time of successful {en,dis}able */
static int sv_debug;                    /* Set non-zero for debug to syslog */
static int sv_mod_status;               /* Set to prevent modunload */

static dev_info_t *sv_dip;              /* Single DIP for driver */
static kmutex_t sv_mutex;               /* Protect global lists, etc. */

static nsc_mem_t        *sv_mem;        /* nsctl memory allocator token */


/*
 * Per device and per major state.
 */

#ifndef _SunOS_5_6
#define UNSAFE_ENTER()
#define UNSAFE_EXIT()
#else
#define UNSAFE_ENTER()  mutex_enter(&unsafe_driver)
#define UNSAFE_EXIT()   mutex_exit(&unsafe_driver)
#endif

                                        /* hash table of major dev structures */
static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
static sv_dev_t *sv_devs;               /* array of per device structures */
static int sv_max_devices;              /* SV version of nsc_max_devices() */
static int sv_ndevices;                 /* number of SV enabled devices */

/*
 * Threading.
 */

int sv_threads_max = 1024;              /* maximum # to dynamically alloc */
int sv_threads = 32;                    /* # to pre-allocate (see sv.conf) */
int sv_threads_extra = 0;               /* addl # we would have alloc'ed */

static nstset_t *sv_tset;               /* the threadset pointer */

static int sv_threads_hysteresis = 4;   /* hysteresis for threadset resizing */
static int sv_threads_dev = 2;          /* # of threads to alloc per device */
static int sv_threads_inc = 8;          /* increment for changing the set */
static int sv_threads_needed;           /* number of threads needed */
static int sv_no_threads;               /* number of nsc_create errors */
static int sv_max_nlive;                /* max number of threads running */
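
/*
 * Editorial note: a worked example of the tuning arithmetic, using the
 * defaults above (see sv_thread_tune() later in this file).  Each
 * enabled device adds sv_threads_dev (2) to sv_threads_needed, so 100
 * enabled devices need 200 threads.  The threadset grows in
 * sv_threads_inc (8) chunks whenever sv_threads_needed reaches the
 * current set size, and shrinks again only once the need has fallen
 * more than sv_threads_inc + sv_threads_hysteresis (12) below the
 * current size, never below the sv_threads (32) floor; demand beyond
 * sv_threads_max (1024) is tracked in sv_threads_extra instead of
 * being allocated.
 */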



/*
 * nsctl fd callbacks.
 */

static int svattach_fd(blind_t);
static int svdetach_fd(blind_t);

static nsc_def_t sv_fd_def[] = {
        { "Attach",     (uintptr_t)svattach_fd, },
        { "Detach",     (uintptr_t)svdetach_fd, },
        { 0, 0, }
};

/*
 * cb_ops functions.
 */

static int svopen(dev_t *, int, int, cred_t *);
static int svclose(dev_t, int, int, cred_t *);
static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int svprint(dev_t, char *);

/*
 * These next functions are layered into the underlying driver's devops.
 */

static int sv_lyr_open(dev_t *, int, int, cred_t *);
static int sv_lyr_close(dev_t, int, int, cred_t *);
static int sv_lyr_strategy(struct buf *);
static int sv_lyr_read(dev_t, struct uio *, cred_t *);
static int sv_lyr_write(dev_t, struct uio *, cred_t *);
static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

static struct cb_ops sv_cb_ops = {
        svopen,         /* open */
        svclose,        /* close */
        nulldev,        /* strategy */
        svprint,
        nodev,          /* dump */
        nodev,          /* read */
        nodev,          /* write */
        svioctl,
        nodev,          /* devmap */
        nodev,          /* mmap */
        nodev,          /* segmap */
        nochpoll,       /* poll */
        ddi_prop_op,
        NULL,           /* NOT a stream */
        D_NEW | D_MP | D_64BIT,
        CB_REV,
        nodev,          /* aread */
        nodev,          /* awrite */
};


/*
 * dev_ops functions.
 */

static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
static int sv_detach(dev_info_t *, ddi_detach_cmd_t);

static struct dev_ops sv_ops = {
        DEVO_REV,
        0,
        sv_getinfo,
        nulldev,        /* identify */
        nulldev,        /* probe */
        sv_attach,
        sv_detach,
        nodev,          /* reset */
        &sv_cb_ops,
        (struct bus_ops *)0
};

/*
 * Module linkage.
 */

extern struct mod_ops mod_driverops;

static struct modldrv modldrv = {
        &mod_driverops,
        "nws:Storage Volume:" ISS_VERSION_STR,
        &sv_ops
};

static struct modlinkage modlinkage = {
        MODREV_1,
        &modldrv,
        0
};


int
_init(void)
{
        int error;

        mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);

        if ((error = mod_install(&modlinkage)) != 0) {
                mutex_destroy(&sv_mutex);
                return (error);
        }

#ifdef DEBUG
        cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
            sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
            ISS_VERSION_STR, BUILD_DATE_STR);
#else
        if (sv_micro_rev) {
                cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
                    sv_major_rev, sv_minor_rev, sv_micro_rev,
                    ISS_VERSION_STR, BUILD_DATE_STR);
        } else {
                cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
                    sv_major_rev, sv_minor_rev,
                    ISS_VERSION_STR, BUILD_DATE_STR);
        }
#endif

        return (error);
}


int
_fini(void)
{
        int error;

        if ((error = mod_remove(&modlinkage)) != 0)
                return (error);

        mutex_destroy(&sv_mutex);

        return (error);
}


int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}


/*
 * Locking & State.
 *
 * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
 * threadset creation and sizing; sv_ndevices.
 *
 * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
 * must be acquired first.
 *
 * sv_lock protects the sv_dev_t structure for an individual device.
 *
 * sv_olock protects the otyp/open members of the sv_dev_t.  If we need
 * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
 * first.
 *
 * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
 * I/O operations to a device simultaneously.
 *
 * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
 * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
 * and (sv_pending == curthread) so that any recursion through
 * sv_lyr_open/sv_lyr_close can be detected.
 */
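
/*
 * Editorial sketch (illustration only) of the combined lock ordering
 * implied by the rules above, with all three locks held at once:
 *
 *	mutex_enter(&sv_mutex);			(first:  global config)
 *	rw_enter(&svp->sv_lock, RW_WRITER);	(second: per-device state)
 *	mutex_enter(&svp->sv_olock);		(third:  otyp/open members)
 *	...
 *	mutex_exit(&svp->sv_olock);
 *	rw_exit(&svp->sv_lock);
 *	mutex_exit(&sv_mutex);
 */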


static int
sv_init_devs(void)
{
        int i;

        ASSERT(MUTEX_HELD(&sv_mutex));

        if (sv_max_devices > 0)
                return (0);

        sv_max_devices = nsc_max_devices();

        if (sv_max_devices <= 0) {
                /* nsctl is not attached (nskernd not running) */
                if (sv_debug > 0)
                        cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
                return (EAGAIN);
        }

        sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
            KM_NOSLEEP, sv_mem);

        if (sv_devs == NULL) {
                cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
                return (ENOMEM);
        }

        for (i = 0; i < sv_max_devices; i++) {
                mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
                rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
        }

        if (sv_debug > 0)
                cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");

        return (0);
}


static int
sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        int rc;

        switch (cmd) {

        case DDI_ATTACH:
                sv_dip = dip;

                if (ddi_create_minor_node(dip, "sv", S_IFCHR,
                    0, DDI_PSEUDO, 0) != DDI_SUCCESS)
                        goto failed;

                mutex_enter(&sv_mutex);

                sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
                if (sv_mem == NULL) {
                        mutex_exit(&sv_mutex);
                        goto failed;
                }

                rc = sv_init_devs();
                if (rc != 0 && rc != EAGAIN) {
                        mutex_exit(&sv_mutex);
                        goto failed;
                }

                mutex_exit(&sv_mutex);


                ddi_report_dev(dip);

                sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
                    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
                    "sv_threads", sv_threads);

                if (sv_debug > 0)
                        cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);

                if (sv_threads > sv_threads_max)
                        sv_threads_max = sv_threads;

                return (DDI_SUCCESS);

        default:
                return (DDI_FAILURE);
        }

failed:
        DTRACE_PROBE(sv_attach_failed);
        (void) sv_detach(dip, DDI_DETACH);
        return (DDI_FAILURE);
}


static int
sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        sv_dev_t *svp;
        int i;

        switch (cmd) {

        case DDI_DETACH:

                /*
                 * Check that everything is disabled.
                 */

                mutex_enter(&sv_mutex);

                if (sv_mod_status == SV_PREVENT_UNLOAD) {
                        mutex_exit(&sv_mutex);
                        DTRACE_PROBE(sv_detach_err_prevent);
                        return (DDI_FAILURE);
                }

                for (i = 0; sv_devs && i < sv_max_devices; i++) {
                        svp = &sv_devs[i];

                        if (svp->sv_state != SV_DISABLE) {
                                mutex_exit(&sv_mutex);
                                DTRACE_PROBE(sv_detach_err_busy);
                                return (DDI_FAILURE);
                        }
                }


                for (i = 0; sv_devs && i < sv_max_devices; i++) {
                        mutex_destroy(&sv_devs[i].sv_olock);
                        rw_destroy(&sv_devs[i].sv_lock);
                }

                if (sv_devs) {
                        nsc_kmem_free(sv_devs,
                            (sv_max_devices * sizeof (*sv_devs)));
                        sv_devs = NULL;
                }
                sv_max_devices = 0;

                if (sv_mem) {
                        nsc_unregister_mem(sv_mem);
                        sv_mem = NULL;
                }

                mutex_exit(&sv_mutex);

                /*
                 * Remove all minor nodes.
                 */

                ddi_remove_minor_node(dip, NULL);
                sv_dip = NULL;

                return (DDI_SUCCESS);

        default:
                return (DDI_FAILURE);
        }
}

static sv_maj_t *
sv_getmajor(const dev_t dev)
{
        sv_maj_t **insert, *maj;
        major_t umaj = getmajor(dev);

        /*
         * See if the hash table entry, or one of the hash chains,
         * is already allocated for this major number.
         */
        if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
                do {
                        if (maj->sm_major == umaj)
                                return (maj);
                } while ((maj = maj->sm_next) != 0);
        }

        /*
         * If sv_mutex is already held there is a design flaw, as the
         * only callers that may reach this allocation path without
         * holding sv_mutex are sv_enable() and sv_dev_to_sv().
         * Return an error instead of panicking the system.
         */
        if (MUTEX_HELD(&sv_mutex)) {
                cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
                return (NULL);
        }

        /*
         * Determine where to allocate a new element in the hash table.
         */
        mutex_enter(&sv_mutex);
        insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
        for (maj = *insert; maj; maj = maj->sm_next) {

                /* Did another thread beat us to it? */
                if (maj->sm_major == umaj) {
                        mutex_exit(&sv_mutex);
                        return (maj);
                }

                /* Remember the tail of the chain as the insert point */
                if (maj->sm_next == NULL)
                        insert = &maj->sm_next;
        }

        /*
         * Allocate at the insert point located above.
         */
        *insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
        if ((maj = *insert) != 0)
                maj->sm_major = umaj;
        else
                cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");

        mutex_exit(&sv_mutex);

        return (maj);
}

/* ARGSUSED */

static int
sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int rc = DDI_FAILURE;

        switch (infocmd) {

        case DDI_INFO_DEVT2DEVINFO:
                *result = sv_dip;
                rc = DDI_SUCCESS;
                break;

        case DDI_INFO_DEVT2INSTANCE:
                /*
                 * We only have a single instance.
                 */
                *result = 0;
                rc = DDI_SUCCESS;
                break;

        default:
                break;
        }

        return (rc);
}


/*
 * Hashing of devices onto major device structures.
 *
 * Individual device structures are hashed onto one of the sm_hash[]
 * buckets in the relevant major device structure.
 *
 * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
 * searching does not require the mutex because of the sm_seq member.
 * sm_seq is incremented on each insertion (-after- hash chain pointer
 * manipulation) and each deletion (-before- hash chain pointer
 * manipulation).  When searching the hash chain, the seq number is
 * checked before accessing each device structure; if the seq number
 * has changed, we restart the search from the top of the hash chain.
 * If we restart more than SV_HASH_RETRY times, we take sv_mutex and
 * search the hash chain (we are guaranteed that this search cannot be
 * interrupted).
 */

#define SV_HASH_RETRY   16
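
/*
 * Editorial sketch (illustration only) of how the updaters pair with
 * the lockless search; the real code is in sv_get_state(),
 * sv_rm_hash() and sv_dev_to_sv() below:
 *
 *	insert, with sv_mutex held (sequence bumped after linking):
 *		*insert = svp;
 *		svp->sv_hash = NULL;
 *		maj->sm_seq++;
 *
 *	delete, with sv_mutex held (sequence bumped before unlinking):
 *		maj->sm_seq++;
 *		*svpp = svp->sv_hash;
 *
 *	lockless search (restart if the sequence number moves):
 *		seq = maj->sm_seq;
 *		for (svp = *hb; svp; svp = svp->sv_hash)
 *			if (maj->sm_seq != seq)
 *				goto retry;
 */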

static sv_dev_t *
sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
{
        minor_t umin = getminor(dev);
        sv_dev_t **hb, *next, *svp;
        sv_maj_t *maj;
        int seq;
        int try;

        /* Get major hash table */
        maj = sv_getmajor(dev);
        if (majpp)
                *majpp = maj;
        if (maj == NULL)
                return (NULL);

        if (maj->sm_inuse == 0) {
                DTRACE_PROBE1(
                    sv_dev_to_sv_end,
                    dev_t, dev);
                return (NULL);
        }

        hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
        try = 0;

retry:
        if (try > SV_HASH_RETRY)
                mutex_enter(&sv_mutex);

        seq = maj->sm_seq;
        for (svp = *hb; svp; svp = next) {
                next = svp->sv_hash;

                nsc_membar_stld();      /* preserve register load order */

                if (maj->sm_seq != seq) {
                        DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
                        try++;
                        goto retry;
                }

                if (svp->sv_dev == dev)
                        break;
        }

        if (try > SV_HASH_RETRY)
                mutex_exit(&sv_mutex);

        return (svp);
}


/*
 * Must be called with sv_mutex held.
 */

static int
sv_get_state(const dev_t udev, sv_dev_t **svpp)
{
        sv_dev_t **hb, **insert, *svp;
        sv_maj_t *maj;
        minor_t umin;
        int i;

        /* Get major hash table */
        if ((maj = sv_getmajor(udev)) == NULL)
                return (SV_EBADDEV);

        /* Determine which minor hash table */
        umin = getminor(udev);
        hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);

        /* look for clash */

        insert = hb;

        for (svp = *hb; svp; svp = svp->sv_hash) {
                if (svp->sv_dev == udev)
                        break;

                if (svp->sv_hash == NULL)
                        insert = &svp->sv_hash;
        }

        if (svp) {
                DTRACE_PROBE1(
                    sv_get_state_enabled,
                    dev_t, udev);
                return (SV_EENABLED);
        }

        /* look for spare sv_devs slot */

        for (i = 0; i < sv_max_devices; i++) {
                svp = &sv_devs[i];

                if (svp->sv_state == SV_DISABLE)
                        break;
        }

        if (i >= sv_max_devices) {
                DTRACE_PROBE1(
                    sv_get_state_noslots,
                    dev_t, udev);
                return (SV_ENOSLOTS);
        }

        svp->sv_state = SV_PENDING;
        svp->sv_pending = curthread;

        *insert = svp;
        svp->sv_hash = NULL;
        maj->sm_seq++;          /* must be after the store to the hash chain */

        *svpp = svp;

        /*
         * We do not know the size of the underlying device at
         * this stage, so initialise the "nblocks" property to
         * zero, and update it whenever we succeed in
         * nsc_reserve'ing the underlying nsc_fd_t.
         */

        svp->sv_nblocks = 0;

        return (0);
}


/*
 * Remove a device structure from its hash chain.
 * Must be called with sv_mutex held.
 */

static void
sv_rm_hash(sv_dev_t *svp)
{
        sv_dev_t **svpp;
        sv_maj_t *maj;

        /* Get major hash table */
        if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
                return;

        /* remove svp from hash chain */

        svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
        while (*svpp) {
                if (*svpp == svp) {
                        /*
                         * increment of sm_seq must be before the
                         * removal from the hash chain
                         */
                        maj->sm_seq++;
                        *svpp = svp->sv_hash;
                        break;
                }

                svpp = &(*svpp)->sv_hash;
        }

        svp->sv_hash = NULL;
}

/*
 * Free (disable) a device structure.
 * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will
 * perform the exits during its processing.
 */

static int
sv_free(sv_dev_t *svp, const int error)
{
        struct cb_ops *cb_ops;
        sv_maj_t *maj;

        /* Get major hash table */
        if ((maj = sv_getmajor(svp->sv_dev)) == NULL) {
                rw_exit(&svp->sv_lock);
                mutex_exit(&sv_mutex);
                return (error);
        }

        svp->sv_state = SV_PENDING;
        svp->sv_pending = curthread;

        /*
         * Close the fd's before removing from the hash or swapping
         * back the cb_ops pointers so that the cache flushes before new
         * io can come in.
         */

        if (svp->sv_fd) {
                (void) nsc_close(svp->sv_fd);
                svp->sv_fd = 0;
        }

        sv_rm_hash(svp);

        if (error != SV_ESDOPEN &&
            error != SV_ELYROPEN && --maj->sm_inuse == 0) {

                if (maj->sm_dev_ops)
                        cb_ops = maj->sm_dev_ops->devo_cb_ops;
                else
                        cb_ops = NULL;

                if (cb_ops && maj->sm_strategy != NULL) {
                        cb_ops->cb_strategy = maj->sm_strategy;
                        cb_ops->cb_close = maj->sm_close;
                        cb_ops->cb_ioctl = maj->sm_ioctl;
                        cb_ops->cb_write = maj->sm_write;
                        cb_ops->cb_open = maj->sm_open;
                        cb_ops->cb_read = maj->sm_read;
                        cb_ops->cb_flag = maj->sm_flag;

                        if (maj->sm_awrite)
                                cb_ops->cb_awrite = maj->sm_awrite;

                        if (maj->sm_aread)
                                cb_ops->cb_aread = maj->sm_aread;

                        /*
                         * corbin XXX
                         * Leave backing device ops in maj->sm_*
                         * to handle any requests that might come
                         * in during the disable.  This could be
                         * a problem however if the backing device
                         * driver is changed while we process these
                         * requests.
                         *
                         * maj->sm_strategy = 0;
                         * maj->sm_awrite = 0;
                         * maj->sm_write = 0;
                         * maj->sm_ioctl = 0;
                         * maj->sm_close = 0;
                         * maj->sm_aread = 0;
                         * maj->sm_read = 0;
                         * maj->sm_open = 0;
                         * maj->sm_flag = 0;
                         *
                         */
                }

                if (maj->sm_dev_ops) {
                        maj->sm_dev_ops = 0;
                }
        }

        if (svp->sv_lh) {
                cred_t *crp = ddi_get_cred();

                /*
                 * Close the protective layered driver open using the
                 * Sun Private layered driver i/f.
                 */

                (void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
                svp->sv_lh = NULL;
        }

        svp->sv_timestamp = nsc_lbolt();
        svp->sv_state = SV_DISABLE;
        svp->sv_pending = NULL;
        rw_exit(&svp->sv_lock);
        mutex_exit(&sv_mutex);

        return (error);
}

/*
 * Reserve the device, taking into account the possibility that
 * the reserve might have to be retried.
 */
static int
sv_reserve(nsc_fd_t *fd, int flags)
{
        int eintr_count;
        int rc;

        eintr_count = 0;
        do {
                rc = nsc_reserve(fd, flags);
                if (rc == EINTR) {
                        ++eintr_count;
                        delay(2);
                }
        } while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));

        return (rc);
}
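
/*
 * Editorial note: a successful sv_reserve() must always be balanced by
 * an nsc_release(), as in this typical pattern (compare sv_enable()
 * and sv_list() below):
 *
 *	if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
 *		nblocks = svp->sv_nblocks;	(device is reserved here)
 *		nsc_release(svp->sv_fd);
 *	}
 */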

static int
sv_enable(const caddr_t path, const int flag,
    const dev_t udev, spcs_s_info_t kstatus)
{
        struct dev_ops *dev_ops;
        struct cb_ops *cb_ops;
        sv_dev_t *svp;
        sv_maj_t *maj;
        nsc_size_t nblocks;
        int rc;
        cred_t *crp;
        ldi_ident_t     li;

        if (udev == (dev_t)-1 || udev == 0) {
                DTRACE_PROBE1(
                    sv_enable_err_baddev,
                    dev_t, udev);
                return (SV_EBADDEV);
        }

        if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
                DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
                return (SV_EAMODE);
        }

        /* Get major hash table */
        if ((maj = sv_getmajor(udev)) == NULL)
                return (SV_EBADDEV);

        mutex_enter(&sv_mutex);

        rc = sv_get_state(udev, &svp);
        if (rc) {
                mutex_exit(&sv_mutex);
                DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
                return (rc);
        }

        rw_enter(&svp->sv_lock, RW_WRITER);

        /*
         * Get real fd used for io
         */

        svp->sv_dev = udev;
        svp->sv_flag = flag;

        /*
         * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
         * function pointer before sv swaps them out.
         */

        svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
            sv_fd_def, (blind_t)udev, &rc);

        if (svp->sv_fd == NULL) {
                if (kstatus)
                        spcs_s_add(kstatus, rc);
                DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
                return (sv_free(svp, SV_ESDOPEN));
        }

        /*
         * Perform a layered driver open using the Sun Private layered
         * driver i/f to ensure that the cb_ops structure for the driver
         * is not detached out from under us whilst sv is enabled.
         */

        crp = ddi_get_cred();
        svp->sv_lh = NULL;

        if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
                rc = ldi_open_by_dev(&svp->sv_dev,
                    OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
        }

        if (rc != 0) {
                if (kstatus)
                        spcs_s_add(kstatus, rc);
                DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
                return (sv_free(svp, SV_ELYROPEN));
        }

        /*
         * Do layering if required - must happen after nsc_open().
         */

        if (maj->sm_inuse++ == 0) {
                maj->sm_dev_ops = nsc_get_devops(getmajor(udev));

                if (maj->sm_dev_ops == NULL ||
                    maj->sm_dev_ops->devo_cb_ops == NULL) {
                        DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
                        return (sv_free(svp, SV_ELOAD));
                }

                dev_ops = maj->sm_dev_ops;
                cb_ops = dev_ops->devo_cb_ops;

                if (cb_ops->cb_strategy == NULL ||
                    cb_ops->cb_strategy == nodev ||
                    cb_ops->cb_strategy == nulldev) {
                        DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
                        return (sv_free(svp, SV_ELOAD));
                }

                if (cb_ops->cb_strategy == sv_lyr_strategy) {
                        DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
                        return (sv_free(svp, SV_ESTRATEGY));
                }

                maj->sm_strategy = cb_ops->cb_strategy;
                maj->sm_close = cb_ops->cb_close;
                maj->sm_ioctl = cb_ops->cb_ioctl;
                maj->sm_write = cb_ops->cb_write;
                maj->sm_open = cb_ops->cb_open;
                maj->sm_read = cb_ops->cb_read;
                maj->sm_flag = cb_ops->cb_flag;

                cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
                cb_ops->cb_strategy = sv_lyr_strategy;
                cb_ops->cb_close = sv_lyr_close;
                cb_ops->cb_ioctl = sv_lyr_ioctl;
                cb_ops->cb_write = sv_lyr_write;
                cb_ops->cb_open = sv_lyr_open;
                cb_ops->cb_read = sv_lyr_read;

                /*
                 * Check that the driver has async I/O entry points
                 * before changing them.
                 */

                if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
                        maj->sm_awrite = 0;
                        maj->sm_aread = 0;
                } else {
                        maj->sm_awrite = cb_ops->cb_awrite;
                        maj->sm_aread = cb_ops->cb_aread;

                        cb_ops->cb_awrite = sv_lyr_awrite;
                        cb_ops->cb_aread = sv_lyr_aread;
                }

                /*
                 * Bug 4645743
                 *
                 * Prevent sv from ever unloading after it has interposed
                 * on a major device because there is a race between
                 * sv removing its layered entry points from the target
                 * dev_ops, a client coming in and accessing the driver,
                 * and the kernel modunloading the sv text.
                 *
                 * To allow unload, do svboot -u, which only happens at
                 * pkgrm time.
                 */
                ASSERT(MUTEX_HELD(&sv_mutex));
                sv_mod_status = SV_PREVENT_UNLOAD;
        }


        svp->sv_timestamp = nsc_lbolt();
        svp->sv_state = SV_ENABLE;
        svp->sv_pending = NULL;
        rw_exit(&svp->sv_lock);

        sv_ndevices++;
        mutex_exit(&sv_mutex);

        nblocks = 0;
        if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
                nblocks = svp->sv_nblocks;
                nsc_release(svp->sv_fd);
        }

        cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
            svp->sv_dev, nblocks);

        return (0);
}


static int
sv_prepare_unload(void)
{
        int rc = 0;

        mutex_enter(&sv_mutex);

        if (sv_mod_status == SV_PREVENT_UNLOAD) {
                if ((sv_ndevices != 0) || (sv_tset != NULL)) {
                        rc = EBUSY;
                } else {
                        sv_mod_status = SV_ALLOW_UNLOAD;
                        delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
                }
        }

        mutex_exit(&sv_mutex);
        return (rc);
}

static int
svattach_fd(blind_t arg)
{
        dev_t dev = (dev_t)arg;
        sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
        int rc;

        if (sv_debug > 0)
                cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);

        if (svp == NULL) {
                cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
                return (0);
        }

        if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
                cmn_err(CE_WARN,
                    "!svattach_fd: nsc_partsize() failed, rc %d", rc);
                svp->sv_nblocks = 0;
        }

        if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
                cmn_err(CE_WARN,
                    "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
                svp->sv_maxfbas = 0;
        }

        if (sv_debug > 0) {
                cmn_err(CE_CONT,
                    "!svattach_fd(%p): size %" NSC_SZFMT ", "
                    "maxfbas %" NSC_SZFMT "\n",
                    arg, svp->sv_nblocks, svp->sv_maxfbas);
        }

        return (0);
}


static int
svdetach_fd(blind_t arg)
{
        dev_t dev = (dev_t)arg;
        sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

        if (sv_debug > 0)
                cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);

        /* svp can be NULL during disable of an sv */
        if (svp == NULL)
                return (0);

        svp->sv_maxfbas = 0;
        svp->sv_nblocks = 0;
        return (0);
}


/*
 * Acquires sv_mutex and sv_lock(RW_WRITER); on the success path both
 * are released by sv_free() during its processing.
 */

/* ARGSUSED */
static int
sv_disable(dev_t dev, spcs_s_info_t kstatus)
{
        sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

        if (svp == NULL) {
                DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
                return (SV_ENODEV);
        }

        mutex_enter(&sv_mutex);
        rw_enter(&svp->sv_lock, RW_WRITER);

        if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
                rw_exit(&svp->sv_lock);
                mutex_exit(&sv_mutex);

                DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
                return (SV_EDISABLED);
        }

        sv_ndevices--;
        return (sv_free(svp, 0));
}



static int
sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
{
        nsc_buf_t *tmph;
        sv_dev_t *svp;
        sv_maj_t *maj;
        int (*fn)();
        dev_t odev;
        int ret;
        int rc;

        svp = sv_dev_to_sv(*devp, &maj);

        if (svp) {
                if (svp->sv_state == SV_PENDING &&
                    svp->sv_pending == curthread) {
                        /*
                         * This is a recursive open from a call to
                         * ddi_lyr_open_by_devt and so we just want
                         * to pass it straight through to the
                         * underlying driver.
                         */
                        DTRACE_PROBE2(sv_lyr_open_recursive,
                            sv_dev_t *, svp,
                            dev_t, *devp);
                        svp = NULL;
                } else
                        rw_enter(&svp->sv_lock, RW_READER);
        }

        odev = *devp;

        if (maj && (fn = maj->sm_open) != 0) {
                if (!(maj->sm_flag & D_MP)) {
                        UNSAFE_ENTER();
                        ret = (*fn)(devp, flag, otyp, crp);
                        UNSAFE_EXIT();
                } else {
                        ret = (*fn)(devp, flag, otyp, crp);
                }

                if (ret == 0) {
                        /*
                         * Re-acquire svp if the driver changed *devp.
                         */

                        if (*devp != odev) {
                                if (svp != NULL)
                                        rw_exit(&svp->sv_lock);

                                svp = sv_dev_to_sv(*devp, NULL);

                                if (svp) {
                                        rw_enter(&svp->sv_lock, RW_READER);
                                }
                        }
                }
        } else {
                ret = ENODEV;
        }

        if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
                /*
                 * Underlying DDI open failed, but we have this
                 * device SV enabled.  If we can read some data
                 * from the device, fake a successful open (this
                 * probably means that this device is RDC'd and we
                 * are getting the data from the secondary node).
                 *
                 * The reserve must be done with NSC_TRY|NSC_NOWAIT to
                 * ensure that it does not deadlock if this open is
                 * coming from nskernd:get_bsize().
                 */
                rc = sv_reserve(svp->sv_fd,
                    NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
                if (rc == 0) {
                        tmph = NULL;

                        rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
                        if (rc <= 0) {
                                /* success */
                                ret = 0;
                        }

                        if (tmph) {
                                (void) nsc_free_buf(tmph);
                                tmph = NULL;
                        }

                        nsc_release(svp->sv_fd);

                        /*
                         * Count the number of layered opens that we
                         * fake since we have to fake a matching number
                         * of closes (OTYP_LYR open/close calls must be
                         * paired).
                         */

                        if (ret == 0 && otyp == OTYP_LYR) {
                                mutex_enter(&svp->sv_olock);
                                svp->sv_openlcnt++;
                                mutex_exit(&svp->sv_olock);
                        }
                }
        }

        if (svp) {
                rw_exit(&svp->sv_lock);
        }

        return (ret);
}


static int
sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
{
        sv_dev_t *svp;
        sv_maj_t *maj;
        int (*fn)();
        int ret;

        svp = sv_dev_to_sv(dev, &maj);

        if (svp &&
            svp->sv_state == SV_PENDING &&
            svp->sv_pending == curthread) {
                /*
                 * This is a recursive close from a call to
                 * ddi_lyr_close and so we just want
                 * to pass it straight through to the
                 * underlying driver.
                 */
                DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
                    dev_t, dev);
                svp = NULL;
        }

        if (svp) {
                rw_enter(&svp->sv_lock, RW_READER);

                if (otyp == OTYP_LYR) {
                        mutex_enter(&svp->sv_olock);

                        if (svp->sv_openlcnt) {
                                /*
                                 * Consume sufficient layered closes to
                                 * account for the opens that we faked
                                 * whilst the device was failed.
                                 */
                                svp->sv_openlcnt--;
                                mutex_exit(&svp->sv_olock);
                                rw_exit(&svp->sv_lock);

                                DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);

                                return (0);
                        }

                        mutex_exit(&svp->sv_olock);
                }
        }

        if (maj && (fn = maj->sm_close) != 0) {
                if (!(maj->sm_flag & D_MP)) {
                        UNSAFE_ENTER();
                        ret = (*fn)(dev, flag, otyp, crp);
                        UNSAFE_EXIT();
                } else {
                        ret = (*fn)(dev, flag, otyp, crp);
                }
        } else {
                ret = ENODEV;
        }

        if (svp) {
                rw_exit(&svp->sv_lock);
        }

        return (ret);
}


/*
 * Convert the specified dev_t into a locked and enabled sv_dev_t, or
 * return NULL.
 */
static sv_dev_t *
sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
{
        sv_dev_t *svp;

        while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
                rw_enter(&svp->sv_lock, RW_READER);

                if (svp->sv_state == SV_ENABLE) {
                        /* locked and enabled */
                        break;
                }

                /*
                 * State was changed while waiting on the lock.
                 * Wait for a stable state.
                 */
                rw_exit(&svp->sv_lock);

                DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);

                delay(2);
        }

        return (svp);
}


static int
sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
{
        sv_dev_t *svp;
        sv_maj_t *maj;
        int (*fn)();
        int rc;

        svp = sv_find_enabled(dev, &maj);
        if (svp == NULL) {
                if (maj) {
                        if (rw == NSC_READ)
                                fn = maj->sm_read;
                        else
                                fn = maj->sm_write;

                        if (fn != 0) {
                                if (!(maj->sm_flag & D_MP)) {
                                        UNSAFE_ENTER();
                                        rc = (*fn)(dev, uiop, crp);
                                        UNSAFE_EXIT();
                                } else {
                                        rc = (*fn)(dev, uiop, crp);
                                }
                        } else {
                                /* no underlying entry point */
                                rc = ENODEV;
                        }

                        return (rc);
                } else {
                        return (ENODEV);
                }
        }

        ASSERT(RW_READ_HELD(&svp->sv_lock));

        if (svp->sv_flag == 0) {
                /*
                 * guard access mode
                 * - prevent user level access to the device
                 */
                DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
                rc = EPERM;
                goto out;
        }

        if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
                DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
                goto out;
        }

        if (rw == NSC_READ)
                rc = nsc_uread(svp->sv_fd, uiop, crp);
        else
                rc = nsc_uwrite(svp->sv_fd, uiop, crp);

        nsc_release(svp->sv_fd);

out:
        rw_exit(&svp->sv_lock);

        return (rc);
}


static int
sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
{
        return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
}


static int
sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
{
        return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
}


/* ARGSUSED */

static int
sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
{
        return (aphysio(sv_lyr_strategy,
            anocancel, dev, B_READ, minphys, aio));
}


/* ARGSUSED */

static int
sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
{
        return (aphysio(sv_lyr_strategy,
            anocancel, dev, B_WRITE, minphys, aio));
}


/*
 * Set up an array containing the list of raw path names.
 * The array for the paths is svn and its capacity is size.
 *
 * If there are more enabled devices than will fit in the array,
 * the number of extra devices is reported through *extra;
 * otherwise *extra is set to zero.
 *
 * Input:
 *      svn     : array for paths
 *      size    : size of the array
 *
 * Output (*extra):
 *      zero    : All paths fit in the array
 *      >0      : Number of enabled devices that do not fit in the array
 */

static int
sv_list(void *ptr, const int size, int *extra, const int ilp32)
{
        sv_name32_t *svn32;
        sv_name_t *svn;
        sv_dev_t *svp;
        int *mode, *nblocks;
        int i, index;
        char *path;

        *extra = 0;
        index = 0;

        if (ilp32)
                svn32 = ptr;
        else
                svn = ptr;

        mutex_enter(&sv_mutex);
        for (i = 0; i < sv_max_devices; i++) {
                svp = &sv_devs[i];

                rw_enter(&svp->sv_lock, RW_READER);

                if (svp->sv_state != SV_ENABLE) {
                        rw_exit(&svp->sv_lock);
                        continue;
                }

                if ((*extra) != 0 || ptr == NULL) {
                        /* Another overflow entry */
                        rw_exit(&svp->sv_lock);
                        (*extra)++;
                        continue;
                }

                if (ilp32) {
                        nblocks = &svn32->svn_nblocks;
                        mode = &svn32->svn_mode;
                        path = svn32->svn_path;

                        svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
                        svn32++;
                } else {
                        nblocks = &svn->svn_nblocks;
                        mode = &svn->svn_mode;
                        path = svn->svn_path;

                        svn->svn_timestamp = svp->sv_timestamp;
                        svn++;
                }

                (void) strcpy(path, nsc_pathname(svp->sv_fd));
                *nblocks = svp->sv_nblocks;
                *mode = svp->sv_flag;

                if (*nblocks == 0) {
                        if (sv_debug > 3)
                                cmn_err(CE_CONT, "!sv_list: need to reserve\n");

                        if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
                                *nblocks = svp->sv_nblocks;
                                nsc_release(svp->sv_fd);
                        }
                }

                if (++index >= size) {
                        /* Out of space */
                        (*extra)++;
                }

                rw_exit(&svp->sv_lock);
        }
        mutex_exit(&sv_mutex);

        if (index < size) {
                /* NULL terminated list */
                if (ilp32)
                        svn32->svn_path[0] = '\0';
                else
                        svn->svn_path[0] = '\0';
        }

        return (0);
}
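
/*
 * Editorial sketch (illustration only; the local names here are
 * hypothetical) of the overflow protocol from a caller's side: call
 * sv_list() and retry with a larger array while entries overflow.
 *
 *	extra = 0;
 *	do {
 *		bytes = size * sizeof (sv_name_t);
 *		svn = kmem_alloc(bytes, KM_SLEEP);
 *		(void) sv_list(svn, size, &extra, ilp32);
 *		if (extra != 0) {
 *			kmem_free(svn, bytes);
 *			size += extra;
 *		}
 *	} while (extra != 0);
 */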


static void
sv_thread_tune(int threads)
{
        int incr = (threads > 0) ? 1 : -1;
        int change = 0;
        int nthreads;

        ASSERT(MUTEX_HELD(&sv_mutex));

        if (sv_threads_extra) {
                /* keep track of any additional threads requested */
                if (threads > 0) {
                        sv_threads_extra += threads;
                        return;
                }
                threads = -threads;
                if (threads >= sv_threads_extra) {
                        threads -= sv_threads_extra;
                        sv_threads_extra = 0;
                        /* fall through to while loop */
                } else {
                        sv_threads_extra -= threads;
                        return;
                }
        } else if (threads > 0) {
                /*
                 * do not increase the number of threads beyond
                 * sv_threads_max when doing dynamic thread tuning
                 */
                nthreads = nst_nthread(sv_tset);
                if ((nthreads + threads) > sv_threads_max) {
                        sv_threads_extra = nthreads + threads - sv_threads_max;
                        threads = sv_threads_max - nthreads;
                        if (threads <= 0)
                                return;
                }
        }

        if (threads < 0)
                threads = -threads;

        while (threads--) {
                nthreads = nst_nthread(sv_tset);
                sv_threads_needed += incr;

                if (sv_threads_needed >= nthreads)
                        change += nst_add_thread(sv_tset, sv_threads_inc);
                else if ((sv_threads_needed <
                    (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
                    ((nthreads - sv_threads_inc) >= sv_threads))
                        change -= nst_del_thread(sv_tset, sv_threads_inc);
        }

#ifdef DEBUG
        if (change) {
                cmn_err(CE_NOTE,
                    "!sv_thread_tune: threads needed %d, nthreads %d, "
                    "nthreads change %d",
                    sv_threads_needed, nst_nthread(sv_tset), change);
        }
#endif
}


1676 /* ARGSUSED */
1677 static int
1678 svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
1679 {
1680         int rc;
1681 
1682         mutex_enter(&sv_mutex);
1683         rc = sv_init_devs();
1684         mutex_exit(&sv_mutex);
1685 
1686         return (rc);
1687 }
1688 
1689 
1690 /* ARGSUSED */
1691 static int
1692 svclose(dev_t dev, int flag, int otyp, cred_t *crp)
1693 {
1694         const int secs = 5;
1695         const int ticks = HZ / 10;
1696         int loops = (secs * HZ) / ticks;
1697 
1698         mutex_enter(&sv_mutex);
1699         while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
1700                 if (nst_nlive(sv_tset) <= 0) {
1701                         nst_destroy(sv_tset);
1702                         sv_tset = NULL;
1703                         break;
1704                 }
1705 
1706                 /* threads still active - wait for them to exit */
1707                 mutex_exit(&sv_mutex);
1708                 delay(ticks);
1709                 loops--;
1710                 mutex_enter(&sv_mutex);
1711         }
1712         mutex_exit(&sv_mutex);
1713 
1714         if (loops <= 0) {
1715                 cmn_err(CE_WARN,
1716 #ifndef DEBUG
1717                     /* do not write to console when non-DEBUG */
1718                     "!"
1719 #endif
1720                     "sv:svclose: threads still active "
1721                     "after %d sec - leaking thread set", secs);
1722         }
1723 
1724         return (0);
1725 }
1726 
1727 
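     /*
      * ioctl entry point for the sv driver's own device node: handles
      * the SVIOC_ENABLE, SVIOC_DISABLE, SVIOC_LIST, SVIOC_VERSION and
      * SVIOC_UNLOAD requests issued from userland (e.g. by svadm).
      */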
1728 static int
1729 svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
1730 {
1731         char itmp1[12], itmp2[12]; /* temp char arrays for editing ints */
1732         spcs_s_info_t kstatus;  /* Kernel version of spcs status */
1733         spcs_s_info_t ustatus;  /* Address of user version of spcs status */
1734         sv_list32_t svl32;      /* 32 bit Initial structure for SVIOC_LIST */
1735         sv_version_t svv;       /* Version structure */
1736         sv_conf_t svc;          /* User config structure */
1737         sv_list_t svl;          /* Initial structure for SVIOC_LIST */
1738         void *usvn;             /* Address of user sv_name_t */
1739         void *svn = NULL;       /* Array for SVIOC_LIST */
1740         uint64_t phash;         /* pathname hash */
1741         int rc = 0;             /* Return code -- errno */
1742         int size;               /* Number of items in array */
1743         int bytes;              /* Byte size of array */
1744         int ilp32;              /* Convert data structures for ilp32 userland */
1745 
1746         *rvalp = 0;
1747 
1748         /*
1749          * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, proceed as usual;
1750          * otherwise it has moved from SV_PREVENT_UNLOAD to SV_ALLOW_UNLOAD
1751          * and the driver is expected to unload soon, so refuse new work.
1752          *
1753          * SV_ALLOW_UNLOAD is a final state, so no need to grab sv_mutex.
1754          */
1755         if (sv_mod_status == SV_ALLOW_UNLOAD) {
1756                 return (EBUSY);
1757         }
1758 
1759         if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
1760                 return (rc);
1761 
1762         kstatus = spcs_s_kcreate();
1763         if (!kstatus) {
1764                 DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
1765                 return (ENOMEM);
1766         }
1767 
1768         ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
1769 
1770         switch (cmd) {
1771 
1772         case SVIOC_ENABLE:
1773 
1774                 if (ilp32) {
1775                         sv_conf32_t svc32;
1776 
1777                         if (ddi_copyin((void *)arg, &svc32,
1778                             sizeof (svc32), mode) < 0) {
1779                                 spcs_s_kfree(kstatus);
1780                                 return (EFAULT);
1781                         }
1782 
1783                         svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1784                         (void) strcpy(svc.svc_path, svc32.svc_path);
1785                         svc.svc_flag  = svc32.svc_flag;
1786                         svc.svc_major = svc32.svc_major;
1787                         svc.svc_minor = svc32.svc_minor;
1788                 } else {
1789                         if (ddi_copyin((void *)arg, &svc,
1790                             sizeof (svc), mode) < 0) {
1791                                 spcs_s_kfree(kstatus);
1792                                 return (EFAULT);
1793                         }
1794                 }
1795 
1796                 /* force to raw access */
1797                 svc.svc_flag = NSC_DEVICE;
1798 
1799                 if (sv_tset == NULL) {
1800                         mutex_enter(&sv_mutex);
1801 
1802                         if (sv_tset == NULL) {
1803                                 sv_tset = nst_init("sv_thr", sv_threads);
1804                         }
1805 
1806                         mutex_exit(&sv_mutex);
1807 
1808                         if (sv_tset == NULL) {
1809                                 cmn_err(CE_WARN,
1810                                     "!sv: could not allocate %d threads",
1811                                     sv_threads);
1812                         }
1813                 }
1814 
1815                 rc = sv_enable(svc.svc_path, svc.svc_flag,
1816                     makedevice(svc.svc_major, svc.svc_minor), kstatus);
1817 
1818                 if (rc == 0) {
1819                         sv_config_time = nsc_lbolt();
1820 
1821                         mutex_enter(&sv_mutex);
1822                         sv_thread_tune(sv_threads_dev);
1823                         mutex_exit(&sv_mutex);
1824                 }
1825 
1826                 DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);
1827 
1828                 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1829                 /* NOTREACHED */
1830 
1831         case SVIOC_DISABLE:
1832 
1833                 if (ilp32) {
1834                         sv_conf32_t svc32;
1835 
1836                         if (ddi_copyin((void *)arg, &svc32,
1837                             sizeof (svc32), mode) < 0) {
1838                                 spcs_s_kfree(kstatus);
1839                                 return (EFAULT);
1840                         }
1841 
1842                         svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1843                         svc.svc_major = svc32.svc_major;
1844                         svc.svc_minor = svc32.svc_minor;
1845                         (void) strcpy(svc.svc_path, svc32.svc_path);
1846                         svc.svc_flag  = svc32.svc_flag;
1847                 } else {
1848                         if (ddi_copyin((void *)arg, &svc,
1849                             sizeof (svc), mode) < 0) {
1850                                 spcs_s_kfree(kstatus);
1851                                 return (EFAULT);
1852                         }
1853                 }
1854 
1855                 if (svc.svc_major == (major_t)-1 &&
1856                     svc.svc_minor == (minor_t)-1) {
1857                         sv_dev_t *svp;
1858                         int i;
1859 
1860                         /*
1861                          * User level could not find the minor device
1862                          * node, so do this the slow way by searching
1863                          * the entire sv config for a matching pathname.
1864                          */
1865 
1866                         phash = nsc_strhash(svc.svc_path);
1867 
1868                         mutex_enter(&sv_mutex);
1869 
1870                         for (i = 0; i < sv_max_devices; i++) {
1871                                 svp = &sv_devs[i];
1872 
1873                                 if (svp->sv_state == SV_DISABLE ||
1874                                     svp->sv_fd == NULL)
1875                                         continue;
1876 
1877                                 if (nsc_fdpathcmp(svp->sv_fd, phash,
1878                                     svc.svc_path) == 0) {
1879                                         svc.svc_major = getmajor(svp->sv_dev);
1880                                         svc.svc_minor = getminor(svp->sv_dev);
1881                                         break;
1882                                 }
1883                         }
1884 
1885                         mutex_exit(&sv_mutex);
1886 
1887                         if (svc.svc_major == (major_t)-1 &&
1888                             svc.svc_minor == (minor_t)-1)
1889                                 return (spcs_s_ocopyoutf(&kstatus,
1890                                     svc.svc_error, SV_ENODEV));
1891                 }
1892 
1893                 rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
1894                     kstatus);
1895 
1896                 if (rc == 0) {
1897                         sv_config_time = nsc_lbolt();
1898 
1899                         mutex_enter(&sv_mutex);
1900                         sv_thread_tune(-sv_threads_dev);
1901                         mutex_exit(&sv_mutex);
1902                 }
1903 
1904                 DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);
1905 
1906                 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1907                 /* NOTREACHED */
1908 
1909         case SVIOC_LIST:
1910 
1911                 if (ilp32) {
1912                         if (ddi_copyin((void *)arg, &svl32,
1913                             sizeof (svl32), mode) < 0) {
1914                                 spcs_s_kfree(kstatus);
1915                                 return (EFAULT);
1916                         }
1917 
1918                         ustatus = (spcs_s_info_t)svl32.svl_error;
1919                         size = svl32.svl_count;
1920                         usvn = (void *)(unsigned long)svl32.svl_names;
1921                 } else {
1922                         if (ddi_copyin((void *)arg, &svl,
1923                             sizeof (svl), mode) < 0) {
1924                                 spcs_s_kfree(kstatus);
1925                                 return (EFAULT);
1926                         }
1927 
1928                         ustatus = svl.svl_error;
1929                         size = svl.svl_count;
1930                         usvn = svl.svl_names;
1931                 }
1932 
1933                 /* Do some boundary checking */
1934                 if ((size < 0) || (size > sv_max_devices)) {
1935                         /* Array size is out of range */
1936                         return (spcs_s_ocopyoutf(&kstatus, ustatus,
1937                             SV_EARRBOUNDS, "0",
1938                             spcs_s_inttostring(sv_max_devices, itmp1,
1939                             sizeof (itmp1), 0),
1940                             spcs_s_inttostring(size, itmp2,
1941                             sizeof (itmp2), 0)));
1942                 }
1943 
1944                 if (ilp32)
1945                         bytes = size * sizeof (sv_name32_t);
1946                 else
1947                         bytes = size * sizeof (sv_name_t);
1948 
1949                 /* Allocate memory for the array of structures */
1950                 if (bytes != 0) {
1951                         /* KM_SLEEP allocations cannot fail */
1952                         svn = kmem_zalloc(bytes, KM_SLEEP);
1953                 }
1957 
1958                 rc = sv_list(svn, size, rvalp, ilp32);
1959                 if (rc) {
1960                         if (svn != NULL)
1961                                 kmem_free(svn, bytes);
1962                         return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
1963                 }
1964 
1965                 if (ilp32) {
1966                         svl32.svl_timestamp = (uint32_t)sv_config_time;
1967                         svl32.svl_maxdevs = (int32_t)sv_max_devices;
1968 
1969                         /* Return the list structure */
1970                         if (ddi_copyout(&svl32, (void *)arg,
1971                             sizeof (svl32), mode) < 0) {
1972                                 spcs_s_kfree(kstatus);
1973                                 if (svn != NULL)
1974                                         kmem_free(svn, bytes);
1975                                 return (EFAULT);
1976                         }
1977                 } else {
1978                         svl.svl_timestamp = sv_config_time;
1979                         svl.svl_maxdevs = sv_max_devices;
1980 
1981                         /* Return the list structure */
1982                         if (ddi_copyout(&svl, (void *)arg,
1983                             sizeof (svl), mode) < 0) {
1984                                 spcs_s_kfree(kstatus);
1985                                 if (svn != NULL)
1986                                         kmem_free(svn, bytes);
1987                                 return (EFAULT);
1988                         }
1989                 }
1990 
1991                 /* Return the array */
1992                 if (svn != NULL) {
1993                         if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
1994                                 kmem_free(svn, bytes);
1995                                 spcs_s_kfree(kstatus);
1996                                 return (EFAULT);
1997                         }
1998                         kmem_free(svn, bytes);
1999                 }
2000 
2001                 DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);
2002 
2003                 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2004                 /* NOTREACHED */
2005 
2006         case SVIOC_VERSION:
2007 
2008                 if (ilp32) {
2009                         sv_version32_t svv32;
2010 
2011                         if (ddi_copyin((void *)arg, &svv32,
2012                             sizeof (svv32), mode) < 0) {
2013                                 spcs_s_kfree(kstatus);
2014                                 return (EFAULT);
2015                         }
2016 
2017                         svv32.svv_major_rev = sv_major_rev;
2018                         svv32.svv_minor_rev = sv_minor_rev;
2019                         svv32.svv_micro_rev = sv_micro_rev;
2020                         svv32.svv_baseline_rev = sv_baseline_rev;
2021 
2022                         if (ddi_copyout(&svv32, (void *)arg,
2023                             sizeof (svv32), mode) < 0) {
2024                                 spcs_s_kfree(kstatus);
2025                                 return (EFAULT);
2026                         }
2027 
2028                         ustatus = (spcs_s_info_t)svv32.svv_error;
2029                 } else {
2030                         if (ddi_copyin((void *)arg, &svv,
2031                             sizeof (svv), mode) < 0) {
2032                                 spcs_s_kfree(kstatus);
2033                                 return (EFAULT);
2034                         }
2035 
2036                         svv.svv_major_rev = sv_major_rev;
2037                         svv.svv_minor_rev = sv_minor_rev;
2038                         svv.svv_micro_rev = sv_micro_rev;
2039                         svv.svv_baseline_rev = sv_baseline_rev;
2040 
2041                         if (ddi_copyout(&svv, (void *)arg,
2042                             sizeof (svv), mode) < 0) {
2043                                 spcs_s_kfree(kstatus);
2044                                 return (EFAULT);
2045                         }
2046 
2047                         ustatus = svv.svv_error;
2048                 }
2049 
2050                 DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);
2051 
2052                 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2053                 /* NOTREACHED */
2054 
2055         case SVIOC_UNLOAD:
2056                 rc = sv_prepare_unload();
2057 
2058                 if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
2059                         rc = EFAULT;
2060                 }
2061 
2062                 spcs_s_kfree(kstatus);
2063                 return (rc);
2064 
2065         default:
2066                 spcs_s_kfree(kstatus);
2067 
2068                 DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL);
2069 
2070                 return (EINVAL);
2071                 /* NOTREACHED */
2072         }
2073 
2074         /* NOTREACHED */
2075 }
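
     /*
      * For illustration only: a 64-bit userland caller of SVIOC_LIST
      * might look like the sketch below.  This assumes the sv_list_t
      * and sv_name_t definitions from sv.h and a unistat status handle
      * from spcs_s_ucreate(); error handling is omitted.
      *
      *	sv_name_t names[32];
      *	sv_list_t svl;
      *
      *	bzero(&svl, sizeof (svl));
      *	svl.svl_count = 32;
      *	svl.svl_names = names;
      *	svl.svl_error = spcs_s_ucreate();
      *
      *	if (ioctl(fd, SVIOC_LIST, &svl) == 0) {
      *		for (int i = 0; i < 32; i++) {
      *			if (names[i].svn_path[0] == '\0')
      *				break;
      *			(void) printf("%s\n", names[i].svn_path);
      *		}
      *	}
      */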
2076 
2077 
2078 /* ARGSUSED */
2079 static int
2080 svprint(dev_t dev, char *str)
2081 {
2082         int instance = ddi_get_instance(sv_dip);
2083         cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
2084         return (0);
2085 }
2086 
2087 
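     /*
      * Layered strategy routine.  If the device is not sv-enabled, the
      * i/o is passed straight to the underlying driver's strategy
      * routine; otherwise it is performed through nsctl in chunks of
      * at most svp->sv_maxfbas FBAs (see the loop below).
      */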
2088 static void
2089 _sv_lyr_strategy(struct buf *bp)
2090 {
2091         caddr_t buf_addr;               /* pointer to linear buffer in bp */
2092         nsc_buf_t *bufh = NULL;
2093         nsc_buf_t *hndl = NULL;
2094         sv_dev_t *svp;
2095         nsc_vec_t *v;
2096         sv_maj_t *maj;
2097         nsc_size_t fba_req, fba_len;    /* FBA lengths */
2098         nsc_off_t fba_off;              /* FBA offset */
2099         size_t tocopy, nbytes;          /* byte lengths */
2100         int rw, rc;                     /* flags and return codes */
2101         int (*fn)();
2102 
2103         rc = 0;
2104 
2105         if (sv_debug > 5)
2106                 cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);
2107 
2108         svp = sv_find_enabled(bp->b_edev, &maj);
2109         if (svp == NULL) {
2110                 if (maj && (fn = maj->sm_strategy) != 0) {
2111                         if (!(maj->sm_flag & D_MP)) {
2112                                 UNSAFE_ENTER();
2113                                 rc = (*fn)(bp);
2114                                 UNSAFE_EXIT();
2115                         } else {
2116                                 rc = (*fn)(bp);
2117                         }
2118                         return;
2119                 } else {
2120                         bioerror(bp, ENODEV);
2121                         biodone(bp);
2122                         return;
2123                 }
2124         }
2125 
2126         ASSERT(RW_READ_HELD(&svp->sv_lock));
2127 
2128         if (svp->sv_flag == 0) {
2129                 /*
2130                  * Guard access mode: prevent user level
2131                  * access to the device.
2132                  */
2133                 DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
2134                 bioerror(bp, EPERM);
2135                 goto out;
2136         }
2137 
2138         if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
2139                 DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);
2140 
2141                 if (rc == EINTR)
2142                         cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
2143                 bioerror(bp, rc);
2144                 goto out;
2145         }
2146 
2147         if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
2148                 DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);
2149 
2150                 if (bp->b_flags & B_READ) {
2151                         /* return EOF, not an error */
2152                         bp->b_resid = bp->b_bcount;
2153                         bioerror(bp, 0);
2154                 } else
2155                         bioerror(bp, EINVAL);
2156 
2157                 goto done;
2158         }
2159 
2160         /*
2161          * Preallocate a handle once per call to strategy.
2162          * If this fails, nsc_alloc_buf() will allocate a
2163          * temporary handle per allocation/free pair.
2164          */
2165 
2166         DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);
2167 
2168         bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);
2169 
2170         DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);
2171 
2172         if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
2173                 DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);
2174 
2175                 cmn_err(CE_WARN,
2176                     "!sv: allocated active handle (bufh %p, flags %x)",
2177                     (void *)bufh, bufh->sb_flag);
2178 
2179                 bioerror(bp, ENXIO);
2180                 goto done;
2181         }
2182 
2183         fba_req = FBA_LEN(bp->b_bcount);
2184         if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
2185                 fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);
2186 
2187         rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;
2188 
2189         bp_mapin(bp);
2190 
2191         bp->b_resid = bp->b_bcount;
2192         buf_addr = bp->b_un.b_addr;
2193         fba_off = 0;
2194 
2195         /*
2196          * fba_req  - requested size of transfer in FBAs after
2197          *              truncation to device extent, and allowing for
2198          *              possible non-FBA bounded final chunk.
2199          * fba_off  - offset of start of chunk from start of bp in FBAs.
2200          * fba_len  - size of this chunk in FBAs.
2201          */
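
             /*
              * For example (sizes hypothetical): with 512-byte FBAs, a
              * 1MB request gives fba_req = 2048; if svp->sv_maxfbas is
              * 1024, the loop below executes twice with fba_len = 1024
              * each time, after which fba_req has been reduced to zero.
              */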
2202 
2203 loop:
2204         fba_len = min(fba_req, svp->sv_maxfbas);
2205         hndl = bufh;
2206 
2207         DTRACE_PROBE4(sv_dbg_allocb_start,
2208             sv_dev_t *, svp,
2209             uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
2210             uint64_t, (uint64_t)fba_len,
2211             int, rw);
2212 
2213         rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
2214             fba_len, rw, &hndl);
2215 
2216         DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);
2217 
2218         if (rc > 0) {
2219                 DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
2220                 bioerror(bp, rc);
2221                 if (hndl != bufh)
2222                         (void) nsc_free_buf(hndl);
2223                 hndl = NULL;
2224                 goto done;
2225         }
2226 
2227         tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
2228         v = hndl->sb_vec;
2229 
2230         if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
2231                 /*
2232                  * Not overwriting all of the last FBA, so read in the
2233                  * old contents now before we overwrite it with the new
2234                  * data.
2235                  */
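
                     /*
                      * For example, with 512-byte FBAs a 1124-byte write
                      * covers FBAs 0-2 but only the first 100 bytes of
                      * FBA 2 (FBA_OFF(1124) == 100), so the last FBA is
                      * pre-read here and its trailing 412 bytes survive
                      * the copy and the nsc_write() below.
                      */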
2236 
2237                 DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
2238                     uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));
2239 
2240                 rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
2241                 if (rc > 0) {
2242                         bioerror(bp, rc);
2243                         goto done;
2244                 }
2245 
2246                 DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
2247         }
2248 
2249         DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);
2250 
2251         while (tocopy > 0) {
2252                 nbytes = min(tocopy, (nsc_size_t)v->sv_len);
2253 
2254                 if (bp->b_flags & B_READ)
2255                         (void) bcopy(v->sv_addr, buf_addr, nbytes);
2256                 else
2257                         (void) bcopy(buf_addr, v->sv_addr, nbytes);
2258 
2259                 bp->b_resid -= nbytes;
2260                 buf_addr += nbytes;
2261                 tocopy -= nbytes;
2262                 v++;
2263         }
2264 
2265         DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);
2266 
2267         if ((bp->b_flags & B_READ) == 0) {
2268                 DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
2269                     uint64_t, (uint64_t)hndl->sb_pos,
2270                     uint64_t, (uint64_t)hndl->sb_len);
2271 
2272                 rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);
2273 
2274                 DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);
2275 
2276                 if (rc > 0) {
2277                         bioerror(bp, rc);
2278                         goto done;
2279                 }
2280         }
2281 
2282         /*
2283          * Adjust the FBA offset and the requested (i.e. remaining)
2284          * length, then loop if there is more data to transfer.
2285          */
2286 
2287         fba_off += fba_len;
2288         fba_req -= fba_len;
2289 
2290         if (fba_req > 0) {
2291                 DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2292 
2293                 rc = nsc_free_buf(hndl);
2294 
2295                 DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2296 
2297                 if (rc > 0) {
2298                         DTRACE_PROBE1(sv_lyr_strategy_err_free,
2299                             struct buf *, bp);
2300                         bioerror(bp, rc);
2301                 }
2302 
2303                 hndl = NULL;
2304 
2305                 if (rc <= 0)
2306                         goto loop;
2307         }
2308 
2309 done:
2310         if (hndl != NULL) {
2311                 DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2312 
2313                 rc = nsc_free_buf(hndl);
2314 
2315                 DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2316 
2317                 if (rc > 0) {
2318                         DTRACE_PROBE1(sv_lyr_strategy_err_free,
2319                             struct buf *, bp);
2320                         bioerror(bp, rc);
2321                 }
2322 
2323                 hndl = NULL;
2324         }
2325 
2326         if (bufh)
2327                 (void) nsc_free_handle(bufh);
2328 
2329         DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);
2330 
2331         nsc_release(svp->sv_fd);
2332 
2333         DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);
2334 
2335 out:
2336         if (sv_debug > 5) {
2337                 cmn_err(CE_CONT,
2338                     "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
2339                     (void *)bp, (void *)bufh, bp->b_error);
2340         }
2341 
2342         DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);
2343 
2344         rw_exit(&svp->sv_lock);
2345         biodone(bp);
2346 }
2347 
2348 
2349 static void
2350 sv_async_strategy(blind_t arg)
2351 {
2352         struct buf *bp = (struct buf *)arg;
2353         _sv_lyr_strategy(bp);
2354 }
2355 
2356 
2357 static int
2358 sv_lyr_strategy(struct buf *bp)
2359 {
2360         nsthread_t *tp;
2361         int nlive;
2362 
2363         /*
2364          * If B_ASYNC was part of the DDI we could use it as a hint to
2365          * not create a thread for synchronous i/o.
2366          */
2367         if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
2368                 /* not sv enabled - just pass through */
2369                 DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
2370                 _sv_lyr_strategy(bp);
2371                 return (0);
2372         }
2373 
2374         if (sv_debug > 4) {
2375                 cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
2376                     nst_nthread(sv_tset), nst_nlive(sv_tset));
2377         }
2378 
2379         /*
2380          * If only guard devices are enabled there won't be a
2381          * threadset, so don't try to use it.
2382          */
2383         tp = NULL;
2384         if (sv_tset != NULL) {
2385                 tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
2386         }
2387 
2388         if (tp == NULL) {
2389                 /*
2390                  * Out of threads, so fall back to synchronous i/o.
2391                  */
2392                 if (sv_debug > 0) {
2393                         cmn_err(CE_CONT,
2394                             "!sv_lyr_strategy: thread alloc failed\n");
2395                 }
2396 
2397                 DTRACE_PROBE1(sv_lyr_strategy_no_thread,
2398                     struct buf *, bp);
2399 
2400                 _sv_lyr_strategy(bp);
2401                 sv_no_threads++;
2402         } else {
2403                 nlive = nst_nlive(sv_tset);
2404                 if (nlive > sv_max_nlive) {
2405                         if (sv_debug > 0) {
2406                                 cmn_err(CE_CONT,
2407                                     "!sv_lyr_strategy: "
2408                                     "new max nlive %d (nthread %d)\n",
2409                                     nlive, nst_nthread(sv_tset));
2410                         }
2411 
2412                         sv_max_nlive = nlive;
2413                 }
2414         }
2415 
2416         return (0);
2417 }
2418 
2419 /*
2420  * re-write the size of the current partition
2421  */
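     /*
      * Rather than copying the whole vtoc in and out again, only the
      * p_size field of the relevant partition entry is rewritten in
      * the user's buffer, at a byte offset computed with offsetof();
      * for an ILP32 caller, for example:
      *
      *	offsetof(struct vtoc32, v_part) +
      *	    pnum * sizeof (struct partition32) +
      *	    offsetof(struct partition32, p_size)
      */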
2422 static int
2423 sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
2424 {
2425         size_t offset;
2426         int ilp32;
2427         int pnum;
2428         int rc;
2429 
2430         ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
2431 
2432         rc = nskern_partition(svp->sv_dev, &pnum);
2433         if (rc != 0) {
2434                 return (rc);
2435         }
2436 
2437         if (pnum < 0 || pnum >= V_NUMPAR) {
2438                 cmn_err(CE_WARN,
2439                     "!sv_gvtoc: unable to determine partition number "
2440                     "for dev %lx", svp->sv_dev);
2441                 return (EINVAL);
2442         }
2443 
2444         if (ilp32) {
2445                 int32_t p_size;
2446 
2447 #ifdef _SunOS_5_6
2448                 offset = offsetof(struct vtoc, v_part);
2449                 offset += sizeof (struct partition) * pnum;
2450                 offset += offsetof(struct partition, p_size);
2451 #else
2452                 offset = offsetof(struct vtoc32, v_part);
2453                 offset += sizeof (struct partition32) * pnum;
2454                 offset += offsetof(struct partition32, p_size);
2455 #endif
2456 
2457                 p_size = (int32_t)svp->sv_nblocks;
2458                 if (p_size == 0) {
2459                         if (sv_reserve(svp->sv_fd,
2460                             NSC_MULTI|NSC_PCATCH) == 0) {
2461                                 p_size = (int32_t)svp->sv_nblocks;
2462                                 nsc_release(svp->sv_fd);
2463                         } else {
2464                                 rc = EINTR;
2465                         }
2466                 }
2467 
2468                 if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2469                     sizeof (p_size), mode) != 0) {
2470                         rc = EFAULT;
2471                 }
2472         } else {
2473                 long p_size;
2474 
2475                 offset = offsetof(struct vtoc, v_part);
2476                 offset += sizeof (struct partition) * pnum;
2477                 offset += offsetof(struct partition, p_size);
2478 
2479                 p_size = (long)svp->sv_nblocks;
2480                 if (p_size == 0) {
2481                         if (sv_reserve(svp->sv_fd,
2482                             NSC_MULTI|NSC_PCATCH) == 0) {
2483                                 p_size = (long)svp->sv_nblocks;
2484                                 nsc_release(svp->sv_fd);
2485                         } else {
2486                                 rc = EINTR;
2487                         }
2488                 }
2489 
2490                 if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2491                     sizeof (p_size), mode) != 0) {
2492                         rc = EFAULT;
2493                 }
2494         }
2495 
2496         return (rc);
2497 }
2498 
2499 
2500 #ifdef DKIOCPARTITION
2501 /*
2502  * re-write the size of the current partition
2503  *
2504  * arg is dk_efi_t.
2505  *
2506  * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
2507  *
2508  * dk_efi_t->dki_data --> efi_gpt_t (label header)
2509  * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
2510  *
2511  * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
2512  * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
2513  *
2514  * This assumes that sizeof (efi_gpt_t) is the same as the size of a
2515  * logical block on the disk.
2516  *
2517  * Everything is little endian (i.e. disk format).
2518  */
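     /*
      * Note that the partition entry array CRC must be recomputed
      * before the header CRC, because efi_gpt_HeaderCRC32 covers the
      * whole header, including the efi_gpt_PartitionEntryArrayCRC32
      * field just rewritten; the header CRC is then computed with its
      * own CRC field zeroed, as the on-disk format requires.
      */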
2519 static int
2520 sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
2521 {
2522         dk_efi_t efi;
2523         efi_gpt_t gpt;
2524         efi_gpe_t *gpe = NULL;
2525         size_t sgpe;
2526         uint64_t p_size;        /* virtual partition size from nsctl */
2527         uint32_t crc;
2528         int unparts;            /* number of parts in user's array */
2529         int pnum;
2530         int rc;
2531 
2532         rc = nskern_partition(svp->sv_dev, &pnum);
2533         if (rc != 0) {
2534                 return (rc);
2535         }
2536 
2537         if (pnum < 0) {
2538                 cmn_err(CE_WARN,
2539                     "!sv_efi: unable to determine partition number for dev %lx",
2540                     svp->sv_dev);
2541                 return (EINVAL);
2542         }
2543 
2544         if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
2545                 return (EFAULT);
2546         }
2547 
2548         efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
2549 
2550         if (efi.dki_length < sizeof (gpt) + sizeof (*gpe)) {
2551                 return (EINVAL);
2552         }
2553 
2554         if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
2555                 rc = EFAULT;
2556                 goto out;
2557         }
2558 
2559         if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
2560                 unparts = 1;
2561         else if (pnum >= unparts) {
2562                 cmn_err(CE_WARN,
2563                     "!sv_efi: partition# beyond end of user array (%d >= %d)",
2564                     pnum, unparts);
2565                 return (EINVAL);
2566         }
2567 
2568         sgpe = sizeof (*gpe) * unparts;
2569         gpe = kmem_alloc(sgpe, KM_SLEEP);
2570 
2571         if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
2572                 rc = EFAULT;
2573                 goto out;
2574         }
2575 
2576         p_size = svp->sv_nblocks;
2577         if (p_size == 0) {
2578                 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2579                         p_size = (diskaddr_t)svp->sv_nblocks;
2580                         nsc_release(svp->sv_fd);
2581                 } else {
2582                         rc = EINTR;
2583                 }
2584         }
2585 
2586         gpe[pnum].efi_gpe_EndingLBA = LE_64(
2587             LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);
2588 
2589         gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
2590         CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
2591         gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2592 
2593         gpt.efi_gpt_HeaderCRC32 = 0;
2594         CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
2595         gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2596 
2597         if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
2598                 rc = EFAULT;
2599                 goto out;
2600         }
2601 
2602         if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
2603                 rc = EFAULT;
2604                 goto out;
2605         }
2606 
2607 out:
2608         if (gpe) {
2609                 kmem_free(gpe, sgpe);
2610         }
2611 
2612         return (rc);
2613 }
2614 
2615 
2616 /*
2617  * Re-write the size of the partition specified by p_partno
2618  *
2619  * Note that if a DKIOCPARTITION is issued to an fd opened against a
2620  * non-sv'd device, but p_partno requests the size for a different
2621  * device that is sv'd, this function will *not* be called as sv is
2622  * not interposed on the original device (the fd).
2623  *
2624  * It would not be easy to change this as we cannot get the partition
2625  * number for the non-sv'd device, so cannot compute the dev_t of the
2626  * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
2627  * its size from nsctl.
2628  *
2629  * See also the "Bug 4755783" comment in sv_lyr_ioctl().
2630  */
2631 static int
2632 sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
2633 {
2634         struct partition64 p64;
2635         sv_dev_t *nsvp = NULL;
2636         diskaddr_t p_size;
2637         minor_t nminor;
2638         int pnum, rc;
2639         dev_t ndev;
2640 
2641         rc = nskern_partition(svp->sv_dev, &pnum);
2642         if (rc != 0) {
2643                 return (rc);
2644         }
2645 
2646         if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
2647                 return (EFAULT);
2648         }
2649 
2650         if (p64.p_partno != pnum) {
2651                 /* switch to requested partition, not the current one */
2652                 nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
2653                 ndev = makedevice(getmajor(svp->sv_dev), nminor);
2654                 nsvp = sv_find_enabled(ndev, NULL);
2655                 if (nsvp == NULL) {
2656                         /* not sv device - just return */
2657                         return (0);
2658                 }
2659 
2660                 svp = nsvp;
2661         }
2662 
2663         p_size = svp->sv_nblocks;
2664         if (p_size == 0) {
2665                 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2666                         p_size = (diskaddr_t)svp->sv_nblocks;
2667                         nsc_release(svp->sv_fd);
2668                 } else {
2669                         rc = EINTR;
2670                 }
2671         }
2672 
2673         if (nsvp != NULL) {
2674                 rw_exit(&nsvp->sv_lock);
2675         }
2676 
2677         if ((rc == 0) && ddi_copyout(&p_size,
2678             (void *)(arg + offsetof(struct partition64, p_size)),
2679             sizeof (p_size), mode) != 0) {
2680                 return (EFAULT);
2681         }
2682 
2683         return (rc);
2684 }
2685 #endif /* DKIOCPARTITION */
2686 
2687 
2688 static int
2689 sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
2690     const int mode, cred_t *crp, int *rvalp)
2691 {
2692         sv_dev_t *svp;
2693         sv_maj_t *maj;
2694         int (*fn)();
2695         int rc = 0;
2696 
2697         maj = 0;
2698         fn = 0;
2699 
2700         /*
2701          * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, proceed as usual;
2702          * otherwise it has moved from SV_PREVENT_UNLOAD to SV_ALLOW_UNLOAD
2703          * and the driver is expected to unload soon, so refuse new work.
2704          *
2705          * SV_ALLOW_UNLOAD is a final state, so no need to grab sv_mutex.
2706          */
2707         if (sv_mod_status == SV_ALLOW_UNLOAD) {
2708                 return (EBUSY);
2709         }
2710 
2711         svp = sv_find_enabled(dev, &maj);
2712         if (svp != NULL) {
2713                 if (nskernd_isdaemon()) {
2714                         /*
2715                          * This is nskernd which always needs to see
2716                          * the underlying disk device accurately.
2717                          *
2718                          * So just pass the ioctl straight through
2719                          * to the underlying driver as though the device
2720                          * was not sv enabled.
2721                          */
2722                         DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
2723                             dev_t, dev);
2724 
2725                         rw_exit(&svp->sv_lock);
2726                         svp = NULL;
2727                 } else {
2728                         ASSERT(RW_READ_HELD(&svp->sv_lock));
2729                 }
2730         }
2731 
2732         /*
2733          * We now have a locked and enabled SV device, or a non-SV device.
2734          */
2735 
2736         switch (cmd) {
2737                 /*
2738                  * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
2739                  * and DKIOCSETEFI are intercepted and faked up as some
2740                  * i/o providers emulate volumes of a different size to
2741                  * the underlying volume.
2742                  *
2743                  * Setting the size by rewriting the vtoc is not permitted.
2744                  */
2745 
2746         case DKIOCSVTOC:
2747 #ifdef DKIOCPARTITION
2748         case DKIOCSETEFI:
2749 #endif
2750                 if (svp == NULL) {
2751                         /* not intercepted -- allow ioctl through */
2752                         break;
2753                 }
2754 
2755                 rw_exit(&svp->sv_lock);
2756 
2757                 DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);
2758 
2759                 return (EPERM);
2760 
2761         default:
2762                 break;
2763         }
2764 
2765         /*
2766          * Pass through the real ioctl command.
2767          */
2768 
2769         if (maj && (fn = maj->sm_ioctl) != 0) {
2770                 if (!(maj->sm_flag & D_MP)) {
2771                         UNSAFE_ENTER();
2772                         rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2773                         UNSAFE_EXIT();
2774                 } else {
2775                         rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2776                 }
2777         } else {
2778                 rc = ENODEV;
2779         }
2780 
2781         /*
2782          * Bug 4755783
2783          * Fix up the size of the current partition to allow
2784          * for the virtual volume to be a different size to the
2785          * physical volume (e.g. for II compact dependent shadows).
2786          *
2787          * Note that this only attempts to fix up the current partition
2788          * - the one that the ioctl was issued against.  There could be
2789          * other sv'd partitions in the same vtoc, but we cannot tell
2790          * so we don't attempt to fix them up.
2791          */
2792 
2793         if (svp != NULL && rc == 0) {
2794                 switch (cmd) {
2795                 case DKIOCGVTOC:
2796                         rc = sv_fix_dkiocgvtoc(arg, mode, svp);
2797                         break;
2798 
2799 #ifdef DKIOCPARTITION
2800                 case DKIOCGETEFI:
2801                         rc = sv_fix_dkiocgetefi(arg, mode, svp);
2802                         break;
2803 
2804                 case DKIOCPARTITION:
2805                         rc = sv_fix_dkiocpartition(arg, mode, svp);
2806                         break;
2807 #endif /* DKIOCPARTITION */
2808                 }
2809         }
2810 
2811         if (svp != NULL) {
2812                 rw_exit(&svp->sv_lock);
2813         }
2814 
2815         return (rc);
2816 }