/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015, Joyent Inc. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
 *   not yet completed. Not possible to enter the zone, but attributes can
 *   be retrieved.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.  A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 *   killing all processes running in the zone. The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
 *   the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
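 *
 *   For example, a kernel thread that must not proceed until a zone has
 *   come all the way down could block with a call along these lines (an
 *   illustrative sketch, not a quote of any particular caller):
 *
 *        zone_status_wait(zone, ZONE_IS_DOWN);
 *
 *   On return the zone's status is at least ZONE_IS_DOWN, and because
 *   transitions are uni-directional it can never revert to an earlier
 *   state.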
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
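 *
 *   A typical lookup therefore pairs the find with a release, roughly
 *   as in this sketch (illustrative only):
 *
 *        zone_t *zone = zone_find_by_id(zoneid);
 *        if (zone != NULL) {
 *                ... use zone; it is held and cannot go away here ...
 *                zone_rele(zone);
 *        }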
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-lwps rctl.
 *   zone_mem_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-locked-memory and zone.max-swap rctls.
 *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 *       currently just zone.max-lofi.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
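 *
 *   For instance, code that needs both the hash tables and a zone's own
 *   lock must take them in the order given above (skipping intermediate
 *   locks is fine), as in this sketch (illustrative only):
 *
 *        mutex_enter(&zonehash_lock);
 *        mutex_enter(&zone->zone_lock);
 *        ... the zone cannot go away, and no zone can be created or
 *        ... destroyed, while both locks are held
 *        mutex_exit(&zone->zone_lock);
 *        mutex_exit(&zonehash_lock);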
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_setattr: sets attributes of a zone
 *   - zone_boot: sets 'init' running for the zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
 */
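
/*
 * From user level these subcodes are reached through libc wrappers; the
 * stable consumers are commands such as zoneadm(1M) and the
 * getzoneid(3C) family.  As a minimal user-level illustration (a
 * hypothetical snippet, not part of this file), a process can map a
 * zone name to its current zone ID with:
 *
 *        #include <zone.h>
 *
 *        zoneid_t zid = getzoneidbyname("myzone");
 *
 * where zid is -1 if no running zone has that name.
 */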

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/strlog.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>
#include <sys/klpd.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>

/*
 * This constant specifies the number of seconds that threads waiting for
 * subsystems to release a zone's general-purpose references will wait before
 * they log the zone's reference counts.  The constant's value shouldn't
 * be so small that reference counts are unnecessarily reported for zones
 * whose references are slowly released.  On the other hand, it shouldn't be so
 * large that users reboot their systems out of frustration over hung zones
 * before the system logs the zones' reference counts.
 */
#define ZONE_DESTROY_TIMEOUT_SECS       60

/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
        datalink_id_t   zdl_id;
        nvlist_t        *zdl_net;
        list_node_t     zdl_linkage;
} zone_dl_t;

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;   /* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;     /* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel used to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char  *zone_status_table[] = {
        ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
        ZONE_EVENT_INITIALIZED,         /* initialized */
        ZONE_EVENT_READY,               /* ready */
        ZONE_EVENT_READY,               /* booting */
        ZONE_EVENT_RUNNING,             /* running */
        ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
        ZONE_EVENT_SHUTTING_DOWN,       /* empty */
        ZONE_EVENT_SHUTTING_DOWN,       /* down */
        ZONE_EVENT_SHUTTING_DOWN,       /* dying */
        ZONE_EVENT_UNINITIALIZED,       /* dead */
};

/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).
 */
static char *zone_ref_subsys_names[] = {
        "NFS",          /* ZONE_REF_NFS */
        "NFSv4",        /* ZONE_REF_NFSV4 */
        "SMBFS",        /* ZONE_REF_SMBFS */
        "MNTFS",        /* ZONE_REF_MNTFS */
        "LOFI",         /* ZONE_REF_LOFI */
        "VFS",          /* ZONE_REF_VFS */
        "IPC"           /* ZONE_REF_IPC */
};

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);

typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure, and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 * Version 7 adds the requested zone_did to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 7;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created/destroyed such
 * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 * it gets added to the list of mounted zones, it ends up on the wrong
 * zone's mount list.  Since a zone can't reside on an NFS file system, we
 * don't have to worry about the zonepath itself.
 *
 * The following functions, block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed(), are used by zones and the VFS
 * layer (respectively) to synchronize zone state transitions and new
 * mounts within a zone.  This synchronization is on a per-zone basis, so
 * activity for one zone will not interfere with activity for another zone.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone state transitions, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.  This means that zone halt may starve if
 * there is a rapid succession of new mounts coming in to the zone.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(zone_t *zp)
{
        int retval = 0;

        /*
         * Since it may block for a long time, block_mounts() shouldn't be
         * called with zonehash_lock held.
         */
        ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress > 0) {
                if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
                        goto signaled;
        }
        /*
         * A negative value of mounts_in_progress indicates that mounts
         * have been blocked by (-mounts_in_progress) different callers
         * (remotely possible if two threads enter zone_shutdown at the same
         * time).
         */
        zp->zone_mounts_in_progress--;
        retval = 1;
signaled:
        mutex_exit(&zp->zone_mount_lock);
        return (retval);
}
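
/*
 * A minimal sketch of the intended pairing (illustrative, not a quote of
 * zone_shutdown()): a zone state transition brackets its critical section
 * with block_mounts()/resume_mounts(), bailing out if the wait for
 * in-flight mounts is interrupted by a signal:
 *
 *        if (block_mounts(zone) == 0)
 *                return (EINTR);
 *        ... perform the state transition ...
 *        resume_mounts(zone);
 */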

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (++zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}
/*
 * The VFS layer is about to perform a mount in this zone; wait while
 * mounts are blocked (by block_mounts() callers) before proceeding, then
 * record the mount as in progress.
 */
void
mount_in_progress(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress < 0)
                cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
        zp->zone_mounts_in_progress++;
        mutex_exit(&zp->zone_mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (--zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}
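
/*
 * On the VFS side the pairing is symmetric.  A hedged sketch of a mount
 * path under these assumptions (illustrative only, not a quote of the
 * VFS code):
 *
 *        mount_in_progress(zone);
 *        error = VFS_MOUNT(vfsp, mvp, uap, cr);
 *        mount_completed(zone);
 */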

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 * holding that lock all the existing zones are marked as
 * ZSD_CREATE_NEEDED and a copy of the ZSD entry is added to the per-zone
 * zone_zsd list (protected by zone_lock).  The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
 * most recent list of keys.  Then under zonehash_lock we walk the zones
 * and mark them.  Similar locking is used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are run without
 * holding any lock, and zsd_flags is used to track their progress, so
 * that by the time zone_key_create (and zone_create) or zone_key_delete
 * (and zone_destroy) returns, all the necessary callbacks have completed.
 *
 * When new zones are created, constructor callbacks for all registered ZSD
 * entries will be called.  That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */

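/*
 * As a minimal sketch of a ZSD consumer (a hypothetical subsystem "foo";
 * all names below are illustrative, not part of this file), a module
 * typically registers a key once at load time and looks its data up by
 * zone thereafter:
 *
 *        static zone_key_t foo_zone_key;
 *
 *        static void *
 *        foo_zone_init(zoneid_t zoneid)
 *        {
 *                return (kmem_zalloc(sizeof (foo_zone_data_t), KM_SLEEP));
 *        }
 *
 *        static void
 *        foo_zone_fini(zoneid_t zoneid, void *data)
 *        {
 *                kmem_free(data, sizeof (foo_zone_data_t));
 *        }
 *
 *        zone_key_create(&foo_zone_key, foo_zone_init, NULL, foo_zone_fini);
 *        ...
 *        foo_zone_data_t *fzd = zone_getspecific(foo_zone_key, zone);
 *
 * with a matching zone_key_delete(foo_zone_key) at module unload time.
 */
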
/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        return (zsd);
                }
        }
        return (NULL);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list. Move it to the front of the list.
 */
static struct zsd_entry *
zsd_find_mru(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        /*
                         * Move to head of list to keep list in MRU order.
                         */
                        if (zsd != list_head(l)) {
                                list_remove(l, zsd);
                                list_insert_head(l, zsd);
                        }
                        return (zsd);
                }
        }
        return (NULL);
}

void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;
        struct zone *zone;
        zone_key_t  key;

        zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
        zsdp->zsd_data = NULL;
        zsdp->zsd_create = create;
        zsdp->zsd_shutdown = shutdown;
        zsdp->zsd_destroy = destroy;

        /*
         * Insert in global list of callbacks. Makes future zone creations
         * see it.
         */
        mutex_enter(&zsd_key_lock);
        key = zsdp->zsd_key = ++zsd_keyval;
        ASSERT(zsd_keyval != 0);
        list_insert_tail(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        /*
         * Insert for all existing zones and mark them as needing
         * a create callback.
         */
        mutex_enter(&zonehash_lock);        /* stop the world */
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                zone_status_t status;

                mutex_enter(&zone->zone_lock);

                /* Skip zones that are on the way down or not yet up */
                status = zone_status_get(zone);
                if (status >= ZONE_IS_DOWN ||
                    status == ZONE_IS_UNINITIALIZED) {
                        mutex_exit(&zone->zone_lock);
                        continue;
                }

                t = zsd_find_mru(&zone->zone_zsd, key);
                if (t != NULL) {
                        /*
                         * A zsd_configure already inserted it after
                         * we dropped zsd_key_lock above.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = key;
                t->zsd_create = create;
                t->zsd_shutdown = shutdown;
                t->zsd_destroy = destroy;
                if (create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                list_insert_tail(&zone->zone_zsd, t);
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        if (create != NULL) {
                /* Now call the create callback for this key */
                zsd_apply_all_zones(zsd_apply_create, key);
        }
        /*
         * It is safe for consumers to use the key now; make it
         * globally visible.  Specifically zone_getspecific() will
         * always successfully return the zone specific data associated
         * with the key.
         */
        *keyp = key;
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 *
 * Remove from the global list and determine the functions that need to
 * be called under a global lock. Then call the functions without
 * holding any locks. Finally free up the zone_zsd entries. (The apply
 * functions need to access the zone_zsd entries to find zsd_data etc.)
 */
int
zone_key_delete(zone_key_t key)
{
        struct zsd_entry *zsdp = NULL;
        zone_t *zone;

        mutex_enter(&zsd_key_lock);
        zsdp = zsd_find_mru(&zsd_registered_keys, key);
        if (zsdp == NULL) {
                mutex_exit(&zsd_key_lock);
                return (-1);
        }
        list_remove(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find_mru(&zone->zone_zsd, key);
                if (del == NULL) {
                        /*
                         * Somebody else got here first, e.g. the zone going
                         * away.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
                ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
                if (del->zsd_shutdown != NULL &&
                    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                        del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(zsd__shutdown__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                if (del->zsd_destroy != NULL &&
                    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                        del->zsd_flags |= ZSD_DESTROY_NEEDED;
                        DTRACE_PROBE2(zsd__destroy__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);
        kmem_free(zsdp, sizeof (*zsdp));

        /* Now call the shutdown and destroy callback for this key */
        zsd_apply_all_zones(zsd_apply_shutdown, key);
        zsd_apply_all_zones(zsd_apply_destroy, key);

        /* Now we can free up the zsdp structures in each zone */
        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find(&zone->zone_zsd, key);
                if (del != NULL) {
                        list_remove(&zone->zone_zsd, del);
                        ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
                        kmem_free(del, sizeof (*del));
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        return (0);
}

/*
 * ZSD counterpart of pthread_setspecific().
 *
 * Since all zsd callbacks, including those with no create function,
 * have an entry in zone_zsd, if the key is registered it is part of
 * the zone_zsd list.
 * Return an error if the key wasn't registered.
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        if (t != NULL) {
                /*
                 * Replace old value with new
                 */
                t->zsd_data = (void *)data;
                mutex_exit(&zone->zone_lock);
                return (0);
        }
        mutex_exit(&zone->zone_lock);
        return (-1);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
        struct zsd_entry *t;
        void *data;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        data = (t == NULL ? NULL : t->zsd_data);
        mutex_exit(&zone->zone_lock);
        return (data);
}
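
/*
 * Because the framework provides no test-and-set atomicity (see the big
 * comment above), a consumer that lazily allocates its per-zone data
 * must supply its own serialization.  A sketch under that assumption
 * (foo_lock and foo_zone_key are hypothetical):
 *
 *        mutex_enter(&foo_lock);
 *        if ((data = zone_getspecific(foo_zone_key, zone)) == NULL) {
 *                data = kmem_zalloc(sizeof (*data), KM_SLEEP);
 *                (void) zone_setspecific(foo_zone_key, zone, data);
 *        }
 *        mutex_exit(&foo_lock);
 */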

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys). The constructor callback is
 * executed later (once the zone exists and with locks dropped).
 */
static void
zone_zsd_configure(zone_t *zone)
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;

        ASSERT(MUTEX_HELD(&zonehash_lock));
        ASSERT(list_head(&zone->zone_zsd) == NULL);
        mutex_enter(&zone->zone_lock);
        mutex_enter(&zsd_key_lock);
        for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
            zsdp = list_next(&zsd_registered_keys, zsdp)) {
                /*
                 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
                 * should not have added anything to it.
                 */
                ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);

                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = zsdp->zsd_key;
                t->zsd_create = zsdp->zsd_create;
                t->zsd_shutdown = zsdp->zsd_shutdown;
                t->zsd_destroy = zsdp->zsd_destroy;
                if (zsdp->zsd_create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, zsdp->zsd_key);
                }
                list_insert_tail(&zone->zone_zsd, t);
        }
        mutex_exit(&zsd_key_lock);
        mutex_exit(&zone->zone_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
        struct zsd_entry *t;

        ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
        ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
        ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

        /*
         * Run the callback solely based on what is registered for the zone
         * in zone_zsd. The global list can change independently of this
         * as keys are registered and unregistered and we don't register new
         * callbacks for a zone that is in the process of going away.
         */
        mutex_enter(&zone->zone_lock);
        for (t = list_head(&zone->zone_zsd); t != NULL;
            t = list_next(&zone->zone_zsd, t)) {
                zone_key_t key = t->zsd_key;

                /* Skip if no callbacks registered */

                if (ct == ZSD_SHUTDOWN) {
                        if (t->zsd_shutdown != NULL &&
                            (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                                t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                                DTRACE_PROBE2(zsd__shutdown__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                } else {
                        if (t->zsd_destroy != NULL &&
                            (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                                t->zsd_flags |= ZSD_DESTROY_NEEDED;
                                DTRACE_PROBE2(zsd__destroy__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                }
        }
        mutex_exit(&zone->zone_lock);

        /* Now call the shutdown and destroy callback for this key */
        zsd_apply_all_keys(zsd_apply_shutdown, zone);
        zsd_apply_all_keys(zsd_apply_destroy, zone);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
        struct zsd_entry *t, *next;

        /*
         * Free all the zsd_entry's we had on this zone.
         */
        mutex_enter(&zone->zone_lock);
        for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
                next = list_next(&zone->zone_zsd, t);
                list_remove(&zone->zone_zsd, t);
                ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_zsd);
        mutex_exit(&zone->zone_lock);
}

/*
 * Apply a function to all zones for particular key value.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zones
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
{
        zone_t *zone;

        mutex_enter(&zonehash_lock);
        zone = list_head(&zone_active);
        while (zone != NULL) {
                if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
                        /* Lock dropped - restart at head */
                        zone = list_head(&zone_active);
                } else {
                        zone = list_next(&zone_active, zone);
                }
        }
        mutex_exit(&zonehash_lock);
}

/*
 * Apply a function to all keys for a particular zone.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zsd callbacks
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = list_head(&zone->zone_zsd);
        while (t != NULL) {
                if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
                        /* Lock dropped - restart at head */
                        t = list_head(&zone->zone_zsd);
                } else {
                        t = list_next(&zone->zone_zsd, t);
                }
        }
        mutex_exit(&zone->zone_lock);
}

/*
 * Call the create function for the zone and key if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets CREATE_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        void *result;
        struct zsd_entry *t;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_CREATE_NEEDED) {
                t->zsd_flags &= ~ZSD_CREATE_NEEDED;
                t->zsd_flags |= ZSD_CREATE_INPROGRESS;
                DTRACE_PROBE2(zsd__create__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);

                dropped = B_TRUE;
                ASSERT(t->zsd_create != NULL);
                DTRACE_PROBE2(zsd__create__start,
                    zone_t *, zone, zone_key_t, key);

                result = (*t->zsd_create)(zone->zone_id);

                DTRACE_PROBE2(zsd__create__end,
                    zone_t *, zone, void *, result);

                ASSERT(result != NULL);
                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = result;
                t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
                t->zsd_flags |= ZSD_CREATE_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__create__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
                t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
                t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
                DTRACE_PROBE2(zsd__shutdown__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_shutdown != NULL);
                data = t->zsd_data;

                DTRACE_PROBE2(zsd__shutdown__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_shutdown)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__shutdown__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
                t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__shutdown__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
                t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
                t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
                DTRACE_PROBE2(zsd__destroy__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_destroy != NULL);
                data = t->zsd_data;
                DTRACE_PROBE2(zsd__destroy__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_destroy)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__destroy__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = NULL;
                t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
                t->zsd_flags |= ZSD_DESTROY_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__destroy__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Wait for any CREATE_NEEDED flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (t->zsd_flags & ZSD_CREATE_NEEDED) {
                DTRACE_PROBE2(zsd__wait__for__creator,
                    zone_t *, zone, struct zsd_entry *, t);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&t->zsd_cv, &zone->zone_lock);
                if (lockp != NULL) {
                        /* First drop zone_lock to preserve order */
                        mutex_exit(&zone->zone_lock);
                        mutex_enter(lockp);
                        mutex_enter(&zone->zone_lock);
                }
        }
        return (dropped);
}

/*
 * Wait for any INPROGRESS flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
                DTRACE_PROBE2(zsd__wait__for__inprogress,
                    zone_t *, zone, struct zsd_entry *, t);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&t->zsd_cv, &zone->zone_lock);
                if (lockp != NULL) {
                        /* First drop zone_lock to preserve order */
                        mutex_exit(&zone->zone_lock);
                        mutex_enter(lockp);
                        mutex_enter(&zone->zone_lock);
                }
        }
        return (dropped);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
        zone_dataset_t *t, *next;

        for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
                next = list_next(&zone->zone_datasets, t);
                list_remove(&zone->zone_datasets, t);
                kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);

        e->rcep_p.zone->zone_shares = nv;
        return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,
        zone_cpu_shares_usage,
        zone_cpu_shares_set,
        rcop_no_test
};
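
/*
 * These ops vectors are wired up at boot.  A hedged sketch of how the
 * registration looks (modeled on zone_init(); the exact flags and limits
 * here are illustrative assumptions, not a quote of that code):
 *
 *        rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
 *            RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER |
 *            RCTL_GLOBAL_DENY_NEVER | RCTL_GLOBAL_NOBASIC |
 *            RCTL_GLOBAL_COUNT, FSS_MAXSHARES, FSS_MAXSHARES,
 *            &zone_cpu_shares_ops);
 */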
1343 
1344 /*
1345  * zone.cpu-cap resource control support.
1346  */
1347 /*ARGSUSED*/
1348 static rctl_qty_t
1349 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1350 {
1351         ASSERT(MUTEX_HELD(&p->p_lock));
1352         return (cpucaps_zone_get(p->p_zone));
1353 }
1354 
1355 /*ARGSUSED*/
1356 static int
1357 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1358     rctl_qty_t nv)
1359 {
1360         zone_t *zone = e->rcep_p.zone;
1361 
1362         ASSERT(MUTEX_HELD(&p->p_lock));
1363         ASSERT(e->rcep_t == RCENTITY_ZONE);
1364 
1365         if (zone == NULL)
1366                 return (0);
1367 
1368         /*
1369          * set cap to the new value.
1370          */
1371         return (cpucaps_zone_set(zone, nv));
1372 }
1373 
1374 static rctl_ops_t zone_cpu_cap_ops = {
1375         rcop_no_action,
1376         zone_cpu_cap_get,
1377         zone_cpu_cap_set,
1378         rcop_no_test
1379 };
1380 
1381 /*ARGSUSED*/
1382 static rctl_qty_t
1383 zone_lwps_usage(rctl_t *r, proc_t *p)
1384 {
1385         rctl_qty_t nlwps;
1386         zone_t *zone = p->p_zone;
1387 
1388         ASSERT(MUTEX_HELD(&p->p_lock));
1389 
1390         mutex_enter(&zone->zone_nlwps_lock);
1391         nlwps = zone->zone_nlwps;
1392         mutex_exit(&zone->zone_nlwps_lock);
1393 
1394         return (nlwps);
1395 }
1396 
1397 /*ARGSUSED*/
1398 static int
1399 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1400     rctl_qty_t incr, uint_t flags)
1401 {
1402         rctl_qty_t nlwps;
1403 
1404         ASSERT(MUTEX_HELD(&p->p_lock));
1405         ASSERT(e->rcep_t == RCENTITY_ZONE);
1406         if (e->rcep_p.zone == NULL)
1407                 return (0);
1408         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1409         nlwps = e->rcep_p.zone->zone_nlwps;
1410 
1411         if (nlwps + incr > rcntl->rcv_value)
1412                 return (1);
1413 
1414         return (0);
1415 }
1416 
1417 /*ARGSUSED*/
1418 static int
1419 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1420 {
1421         ASSERT(MUTEX_HELD(&p->p_lock));
1422         ASSERT(e->rcep_t == RCENTITY_ZONE);
1423         if (e->rcep_p.zone == NULL)
1424                 return (0);
1425         e->rcep_p.zone->zone_nlwps_ctl = nv;
1426         return (0);
1427 }
1428 
1429 static rctl_ops_t zone_lwps_ops = {
1430         rcop_no_action,
1431         zone_lwps_usage,
1432         zone_lwps_set,
1433         zone_lwps_test,
1434 };
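
     /*
      * Simplified sketch of how the framework consults a test callback
      * such as zone_lwps_test when an lwp is about to be created (the
      * exact call site varies; this is illustrative only).  With p_lock
      * and zone_nlwps_lock held, something equivalent to
      *
      *        if (rctl_test(rc_zone_nlwps, zone->zone_rctls, p, 1, 0) &
      *            RCT_DENY)
      *                return (EAGAIN);
      *
      * runs, and a non-zero return from the test callback translates
      * into denial of the requested increment.
      */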
1435 
1436 /*ARGSUSED*/
1437 static rctl_qty_t
1438 zone_procs_usage(rctl_t *r, proc_t *p)
1439 {
1440         rctl_qty_t nprocs;
1441         zone_t *zone = p->p_zone;
1442 
1443         ASSERT(MUTEX_HELD(&p->p_lock));
1444 
1445         mutex_enter(&zone->zone_nlwps_lock);
1446         nprocs = zone->zone_nprocs;
1447         mutex_exit(&zone->zone_nlwps_lock);
1448 
1449         return (nprocs);
1450 }
1451 
1452 /*ARGSUSED*/
1453 static int
1454 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1455     rctl_qty_t incr, uint_t flags)
1456 {
1457         rctl_qty_t nprocs;
1458 
1459         ASSERT(MUTEX_HELD(&p->p_lock));
1460         ASSERT(e->rcep_t == RCENTITY_ZONE);
1461         if (e->rcep_p.zone == NULL)
1462                 return (0);
1463         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1464         nprocs = e->rcep_p.zone->zone_nprocs;
1465 
1466         if (nprocs + incr > rcntl->rcv_value)
1467                 return (1);
1468 
1469         return (0);
1470 }
1471 
1472 /*ARGSUSED*/
1473 static int
1474 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1475 {
1476         ASSERT(MUTEX_HELD(&p->p_lock));
1477         ASSERT(e->rcep_t == RCENTITY_ZONE);
1478         if (e->rcep_p.zone == NULL)
1479                 return (0);
1480         e->rcep_p.zone->zone_nprocs_ctl = nv;
1481         return (0);
1482 }
1483 
1484 static rctl_ops_t zone_procs_ops = {
1485         rcop_no_action,
1486         zone_procs_usage,
1487         zone_procs_set,
1488         zone_procs_test,
1489 };
1490 
1491 /*ARGSUSED*/
1492 static rctl_qty_t
1493 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1494 {
1495         ASSERT(MUTEX_HELD(&p->p_lock));
1496         return (p->p_zone->zone_shmmax);
1497 }
1498 
1499 /*ARGSUSED*/
1500 static int
1501 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1502     rctl_qty_t incr, uint_t flags)
1503 {
1504         rctl_qty_t v;
1505         ASSERT(MUTEX_HELD(&p->p_lock));
1506         ASSERT(e->rcep_t == RCENTITY_ZONE);
1507         v = e->rcep_p.zone->zone_shmmax + incr;
1508         if (v > rval->rcv_value)
1509                 return (1);
1510         return (0);
1511 }
1512 
1513 static rctl_ops_t zone_shmmax_ops = {
1514         rcop_no_action,
1515         zone_shmmax_usage,
1516         rcop_no_set,
1517         zone_shmmax_test
1518 };
1519 
1520 /*ARGSUSED*/
1521 static rctl_qty_t
1522 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1523 {
1524         ASSERT(MUTEX_HELD(&p->p_lock));
1525         return (p->p_zone->zone_ipc.ipcq_shmmni);
1526 }
1527 
1528 /*ARGSUSED*/
1529 static int
1530 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1531     rctl_qty_t incr, uint_t flags)
1532 {
1533         rctl_qty_t v;
1534         ASSERT(MUTEX_HELD(&p->p_lock));
1535         ASSERT(e->rcep_t == RCENTITY_ZONE);
1536         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1537         if (v > rval->rcv_value)
1538                 return (1);
1539         return (0);
1540 }
1541 
1542 static rctl_ops_t zone_shmmni_ops = {
1543         rcop_no_action,
1544         zone_shmmni_usage,
1545         rcop_no_set,
1546         zone_shmmni_test
1547 };
1548 
1549 /*ARGSUSED*/
1550 static rctl_qty_t
1551 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1552 {
1553         ASSERT(MUTEX_HELD(&p->p_lock));
1554         return (p->p_zone->zone_ipc.ipcq_semmni);
1555 }
1556 
1557 /*ARGSUSED*/
1558 static int
1559 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1560     rctl_qty_t incr, uint_t flags)
1561 {
1562         rctl_qty_t v;
1563         ASSERT(MUTEX_HELD(&p->p_lock));
1564         ASSERT(e->rcep_t == RCENTITY_ZONE);
1565         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1566         if (v > rval->rcv_value)
1567                 return (1);
1568         return (0);
1569 }
1570 
1571 static rctl_ops_t zone_semmni_ops = {
1572         rcop_no_action,
1573         zone_semmni_usage,
1574         rcop_no_set,
1575         zone_semmni_test
1576 };
1577 
1578 /*ARGSUSED*/
1579 static rctl_qty_t
1580 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1581 {
1582         ASSERT(MUTEX_HELD(&p->p_lock));
1583         return (p->p_zone->zone_ipc.ipcq_msgmni);
1584 }
1585 
1586 /*ARGSUSED*/
1587 static int
1588 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1589     rctl_qty_t incr, uint_t flags)
1590 {
1591         rctl_qty_t v;
1592         ASSERT(MUTEX_HELD(&p->p_lock));
1593         ASSERT(e->rcep_t == RCENTITY_ZONE);
1594         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1595         if (v > rval->rcv_value)
1596                 return (1);
1597         return (0);
1598 }
1599 
1600 static rctl_ops_t zone_msgmni_ops = {
1601         rcop_no_action,
1602         zone_msgmni_usage,
1603         rcop_no_set,
1604         zone_msgmni_test
1605 };
1606 
1607 /*ARGSUSED*/
1608 static rctl_qty_t
1609 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1610 {
1611         rctl_qty_t q;
1612         ASSERT(MUTEX_HELD(&p->p_lock));
1613         mutex_enter(&p->p_zone->zone_mem_lock);
1614         q = p->p_zone->zone_locked_mem;
1615         mutex_exit(&p->p_zone->zone_mem_lock);
1616         return (q);
1617 }
1618 
1619 /*ARGSUSED*/
1620 static int
1621 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1622     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1623 {
1624         rctl_qty_t q;
1625         zone_t *z;
1626 
1627         z = e->rcep_p.zone;
1628         ASSERT(MUTEX_HELD(&p->p_lock));
1629         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1630         q = z->zone_locked_mem;
1631         if (q + incr > rcntl->rcv_value)
1632                 return (1);
1633         return (0);
1634 }
1635 
1636 /*ARGSUSED*/
1637 static int
1638 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1639     rctl_qty_t nv)
1640 {
1641         ASSERT(MUTEX_HELD(&p->p_lock));
1642         ASSERT(e->rcep_t == RCENTITY_ZONE);
1643         if (e->rcep_p.zone == NULL)
1644                 return (0);
1645         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1646         return (0);
1647 }
1648 
1649 static rctl_ops_t zone_locked_mem_ops = {
1650         rcop_no_action,
1651         zone_locked_mem_usage,
1652         zone_locked_mem_set,
1653         zone_locked_mem_test
1654 };
1655 
1656 /*ARGSUSED*/
1657 static rctl_qty_t
1658 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1659 {
1660         rctl_qty_t q;
1661         zone_t *z = p->p_zone;
1662 
1663         ASSERT(MUTEX_HELD(&p->p_lock));
1664         mutex_enter(&z->zone_mem_lock);
1665         q = z->zone_max_swap;
1666         mutex_exit(&z->zone_mem_lock);
1667         return (q);
1668 }
1669 
1670 /*ARGSUSED*/
1671 static int
1672 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1673     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1674 {
1675         rctl_qty_t q;
1676         zone_t *z;
1677 
1678         z = e->rcep_p.zone;
1679         ASSERT(MUTEX_HELD(&p->p_lock));
1680         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1681         q = z->zone_max_swap;
1682         if (q + incr > rcntl->rcv_value)
1683                 return (1);
1684         return (0);
1685 }
1686 
1687 /*ARGSUSED*/
1688 static int
1689 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1690     rctl_qty_t nv)
1691 {
1692         ASSERT(MUTEX_HELD(&p->p_lock));
1693         ASSERT(e->rcep_t == RCENTITY_ZONE);
1694         if (e->rcep_p.zone == NULL)
1695                 return (0);
1696         e->rcep_p.zone->zone_max_swap_ctl = nv;
1697         return (0);
1698 }
1699 
1700 static rctl_ops_t zone_max_swap_ops = {
1701         rcop_no_action,
1702         zone_max_swap_usage,
1703         zone_max_swap_set,
1704         zone_max_swap_test
1705 };
1706 
1707 /*ARGSUSED*/
1708 static rctl_qty_t
1709 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1710 {
1711         rctl_qty_t q;
1712         zone_t *z = p->p_zone;
1713 
1714         ASSERT(MUTEX_HELD(&p->p_lock));
1715         mutex_enter(&z->zone_rctl_lock);
1716         q = z->zone_max_lofi;
1717         mutex_exit(&z->zone_rctl_lock);
1718         return (q);
1719 }
1720 
1721 /*ARGSUSED*/
1722 static int
1723 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1724     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1725 {
1726         rctl_qty_t q;
1727         zone_t *z;
1728 
1729         z = e->rcep_p.zone;
1730         ASSERT(MUTEX_HELD(&p->p_lock));
1731         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1732         q = z->zone_max_lofi;
1733         if (q + incr > rcntl->rcv_value)
1734                 return (1);
1735         return (0);
1736 }
1737 
1738 /*ARGSUSED*/
1739 static int
1740 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1741     rctl_qty_t nv)
1742 {
1743         ASSERT(MUTEX_HELD(&p->p_lock));
1744         ASSERT(e->rcep_t == RCENTITY_ZONE);
1745         if (e->rcep_p.zone == NULL)
1746                 return (0);
1747         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1748         return (0);
1749 }
1750 
1751 static rctl_ops_t zone_max_lofi_ops = {
1752         rcop_no_action,
1753         zone_max_lofi_usage,
1754         zone_max_lofi_set,
1755         zone_max_lofi_test
1756 };
1757 
1758 /*
1759  * Helper function to stamp the zone with a unique ID.  Unlike zone
      * IDs, which may be recycled, uniqid values increase monotonically.
1760  */
1761 static void
1762 zone_uniqid(zone_t *zone)
1763 {
1764         static uint64_t uniqid = 0;
1765 
1766         ASSERT(MUTEX_HELD(&zonehash_lock));
1767         zone->zone_uniqid = uniqid++;
1768 }
1769 
1770 /*
1771  * Returns a held pointer to the "kcred" for the specified zone.
1772  */
1773 struct cred *
1774 zone_get_kcred(zoneid_t zoneid)
1775 {
1776         zone_t *zone;
1777         cred_t *cr;
1778 
1779         if ((zone = zone_find_by_id(zoneid)) == NULL)
1780                 return (NULL);
1781         cr = zone->zone_kcred;
1782         crhold(cr);
1783         zone_rele(zone);
1784         return (cr);
1785 }
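
     /*
      * Hypothetical consumer sketch: the returned cred is held on the
      * caller's behalf and must be released with crfree() when no
      * longer needed.
      *
      *        cred_t *cr;
      *
      *        if ((cr = zone_get_kcred(zoneid)) != NULL) {
      *                ... operate with the zone's kcred ...
      *                crfree(cr);
      *        }
      */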
1786 
1787 static int
1788 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1789 {
1790         zone_t *zone = ksp->ks_private;
1791         zone_kstat_t *zk = ksp->ks_data;
1792 
1793         if (rw == KSTAT_WRITE)
1794                 return (EACCES);
1795 
1796         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1797         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1798         return (0);
1799 }
1800 
1801 static int
1802 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1803 {
1804         zone_t *zone = ksp->ks_private;
1805         zone_kstat_t *zk = ksp->ks_data;
1806 
1807         if (rw == KSTAT_WRITE)
1808                 return (EACCES);
1809 
1810         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1811         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1812         return (0);
1813 }
1814 
1815 static int
1816 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1817 {
1818         zone_t *zone = ksp->ks_private;
1819         zone_kstat_t *zk = ksp->ks_data;
1820 
1821         if (rw == KSTAT_WRITE)
1822                 return (EACCES);
1823 
1824         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1825         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1826         return (0);
1827 }
1828 
1829 static kstat_t *
1830 zone_kstat_create_common(zone_t *zone, char *name,
1831     int (*updatefunc) (kstat_t *, int))
1832 {
1833         kstat_t *ksp;
1834         zone_kstat_t *zk;
1835 
1836         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1837             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1838             KSTAT_FLAG_VIRTUAL);
1839 
1840         if (ksp == NULL)
1841                 return (NULL);
1842 
1843         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1844         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1845         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1846         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1847         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1848         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1849         ksp->ks_update = updatefunc;
1850         ksp->ks_private = zone;
1851         kstat_install(ksp);
1852         return (ksp);
1853 }
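
     /*
      * Each kstat created via this helper exports three named values:
      * "zonename", "usage" (current usage), and "value" (the current
      * limit).  Illustratively -- the exact module, instance, and kstat
      * names come from rctl_kstat_create_zone() -- a global zone
      * observer might run something like:
      *
      *        $ kstat -c zone_caps -n lockedmem_zone_1
      */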
1854 
1856 static int
1857 zone_mcap_kstat_update(kstat_t *ksp, int rw)
1858 {
1859         zone_t *zone = ksp->ks_private;
1860         zone_mcap_kstat_t *zmp = ksp->ks_data;
1861 
1862         if (rw == KSTAT_WRITE)
1863                 return (EACCES);
1864 
1865         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1866         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1867         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1868         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1869         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1870 
1871         return (0);
1872 }
1873 
1874 static kstat_t *
1875 zone_mcap_kstat_create(zone_t *zone)
1876 {
1877         kstat_t *ksp;
1878         zone_mcap_kstat_t *zmp;
1879 
1880         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1881             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1882             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1883             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1884                 return (NULL);
1885 
1886         if (zone->zone_id != GLOBAL_ZONEID)
1887                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1888 
1889         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1890         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1891         ksp->ks_lock = &zone->zone_mcap_lock;
1892         zone->zone_mcap_stats = zmp;
1893 
1894         /* The kstat "name" field is not large enough for a full zonename */
1895         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1896         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1897         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1898         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1899         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1900         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1901         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1902             KSTAT_DATA_UINT64);
1903 
1904         ksp->ks_update = zone_mcap_kstat_update;
1905         ksp->ks_private = zone;
1906 
1907         kstat_install(ksp);
1908         return (ksp);
1909 }
1910 
1911 static int
1912 zone_misc_kstat_update(kstat_t *ksp, int rw)
1913 {
1914         zone_t *zone = ksp->ks_private;
1915         zone_misc_kstat_t *zmp = ksp->ks_data;
1916         hrtime_t hrtime;
1917         uint64_t tmp;
1918 
1919         if (rw == KSTAT_WRITE)
1920                 return (EACCES);
1921 
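             /*
              * Each zone_ustate counter accumulates unscaled time; sum
              * the per-CPU values, convert the unsigned total to a
              * signed hrtime_t, and let scalehrtime() express it in
              * nanoseconds.
              */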
1922         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME);
1923         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1924         scalehrtime(&hrtime);
1925         zmp->zm_stime.value.ui64 = hrtime;
1926 
1927         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME);
1928         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1929         scalehrtime(&hrtime);
1930         zmp->zm_utime.value.ui64 = hrtime;
1931 
1932         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME);
1933         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1934         scalehrtime(&hrtime);
1935         zmp->zm_wtime.value.ui64 = hrtime;
1936 
1937         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1938         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1939         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1940 
1941         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1942         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1943         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1944         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1945 
1946         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1947 
1948         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1949         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1950 
1951         return (0);
1952 }
1953 
1954 static kstat_t *
1955 zone_misc_kstat_create(zone_t *zone)
1956 {
1957         kstat_t *ksp;
1958         zone_misc_kstat_t *zmp;
1959 
1960         if ((ksp = kstat_create_zone("zones", zone->zone_id,
1961             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1962             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1963             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1964                 return (NULL);
1965 
1966         if (zone->zone_id != GLOBAL_ZONEID)
1967                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1968 
1969         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1970         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1971         ksp->ks_lock = &zone->zone_misc_lock;
1972         zone->zone_misc_stats = zmp;
1973 
1974         /* The kstat "name" field is not large enough for a full zonename */
1975         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1976         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1977         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1978         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1979         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1980         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1981         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1982         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1983             KSTAT_DATA_UINT32);
1984         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1985         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1986             KSTAT_DATA_UINT32);
1987         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1988         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1989         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1990             KSTAT_DATA_UINT32);
1991         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1992         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1993 
1994         ksp->ks_update = zone_misc_kstat_update;
1995         ksp->ks_private = zone;
1996 
1997         kstat_install(ksp);
1998         return (ksp);
1999 }
2000 
2001 static void
2002 zone_kstat_create(zone_t *zone)
2003 {
2004         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2005             "lockedmem", zone_lockedmem_kstat_update);
2006         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2007             "swapresv", zone_swapresv_kstat_update);
2008         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2009             "nprocs", zone_nprocs_kstat_update);
2010 
2011         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2012                 zone->zone_mcap_stats = kmem_zalloc(
2013                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2014         }
2015 
2016         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2017                 zone->zone_misc_stats = kmem_zalloc(
2018                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2019         }
2020 }
2021 
2022 static void
2023 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2024 {
2025         void *data;
2026 
2027         if (*pkstat != NULL) {
2028                 data = (*pkstat)->ks_data;
2029                 kstat_delete(*pkstat);
2030                 kmem_free(data, datasz);
2031                 *pkstat = NULL;
2032         }
2033 }
2034 
2035 static void
2036 zone_kstat_delete(zone_t *zone)
2037 {
2038         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2039             sizeof (zone_kstat_t));
2040         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2041             sizeof (zone_kstat_t));
2042         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2043             sizeof (zone_kstat_t));
2044         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2045             sizeof (zone_mcap_kstat_t));
2046         zone_kstat_delete_common(&zone->zone_misc_ksp,
2047             sizeof (zone_misc_kstat_t));
2048 }
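
     /*
      * All of the kstats torn down above were created with
      * KSTAT_FLAG_VIRTUAL, meaning the kstat framework does not own
      * ks_data; that is why zone_kstat_delete_common() frees the data
      * buffer itself after kstat_delete().
      */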
2049 
2050 /*
2051  * Called very early on in boot to initialize the ZSD list so that
2052  * zone_key_create() can be called before zone_init().  It also initializes
2053  * portions of zone0 which may be used before zone_init() is called.  The
2054  * variable "global_zone" will be set when zone0 is fully initialized by
2055  * zone_init().
2056  */
2057 void
2058 zone_zsd_init(void)
2059 {
2060         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2061         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2062         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2063             offsetof(struct zsd_entry, zsd_linkage));
2064         list_create(&zone_active, sizeof (zone_t),
2065             offsetof(zone_t, zone_linkage));
2066         list_create(&zone_deathrow, sizeof (zone_t),
2067             offsetof(zone_t, zone_linkage));
2068 
2069         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2070         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2071         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2072         zone0.zone_shares = 1;
2073         zone0.zone_nlwps = 0;
2074         zone0.zone_nlwps_ctl = INT_MAX;
2075         zone0.zone_nprocs = 0;
2076         zone0.zone_nprocs_ctl = INT_MAX;
2077         zone0.zone_locked_mem = 0;
2078         zone0.zone_locked_mem_ctl = UINT64_MAX;
2079         ASSERT(zone0.zone_max_swap == 0);
2080         zone0.zone_max_swap_ctl = UINT64_MAX;
2081         zone0.zone_max_lofi = 0;
2082         zone0.zone_max_lofi_ctl = UINT64_MAX;
2083         zone0.zone_shmmax = 0;
2084         zone0.zone_ipc.ipcq_shmmni = 0;
2085         zone0.zone_ipc.ipcq_semmni = 0;
2086         zone0.zone_ipc.ipcq_msgmni = 0;
2087         zone0.zone_name = GLOBAL_ZONENAME;
2088         zone0.zone_nodename = utsname.nodename;
2089         zone0.zone_domain = srpc_domain;
2090         zone0.zone_hostid = HW_INVALID_HOSTID;
2091         zone0.zone_fs_allowed = NULL;
2092         psecflags_default(&zone0.zone_secflags);
2093         zone0.zone_ref = 1;
2094         zone0.zone_id = GLOBAL_ZONEID;
2095         zone0.zone_status = ZONE_IS_RUNNING;
2096         zone0.zone_rootpath = "/";
2097         zone0.zone_rootpathlen = 2;
2098         zone0.zone_psetid = ZONE_PS_INVAL;
2099         zone0.zone_ncpus = 0;
2100         zone0.zone_ncpus_online = 0;
2101         zone0.zone_proc_initpid = 1;
2102         zone0.zone_initname = initname;
2103         zone0.zone_lockedmem_kstat = NULL;
2104         zone0.zone_swapresv_kstat = NULL;
2105         zone0.zone_nprocs_kstat = NULL;
2106 
2107         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2108             offsetof(zone_ref_t, zref_linkage));
2109         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2110             offsetof(struct zsd_entry, zsd_linkage));
2111         list_insert_head(&zone_active, &zone0);
2112 
2113         /*
2114          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2115          * to anything meaningful.  It is assigned to be 'rootdir' in
2116          * vfs_mountroot().
2117          */
2118         zone0.zone_rootvp = NULL;
2119         zone0.zone_vfslist = NULL;
2120         zone0.zone_bootargs = initargs;
2121         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2122         /*
2123          * The global zone has all privileges
2124          */
2125         priv_fillset(zone0.zone_privset);
2126         /*
2127          * Add p0 to the global zone
2128          */
2129         zone0.zone_zsched = &p0;
2130         p0.p_zone = &zone0;
2131 }
2132 
2133 /*
2134  * Compute a hash value based on the contents of the label and the DOI.  The
2135  * hash algorithm is somewhat arbitrary, but is based on the observation that
2136  * humans will likely pick labels that differ by amounts that work out to be
2137  * multiples of the number of hash chains, and thus stirring in some primes
2138  * should help.
2139  */
2140 static uint_t
2141 hash_bylabel(void *hdata, mod_hash_key_t key)
2142 {
2143         const ts_label_t *lab = (ts_label_t *)key;
2144         const uint32_t *up, *ue;
2145         uint_t hash;
2146         int i;
2147 
2148         _NOTE(ARGUNUSED(hdata));
2149 
2150         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2151         /* we depend on alignment of label, but not representation */
2152         up = (const uint32_t *)&lab->tsl_label;
2153         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2154         i = 1;
2155         while (up < ue) {
2156                 /*
                      * The shift-and-add computes *up * (2^n + 1) with
                      * n = (i % 16) + 1; 2^n + 1, 1 <= n <= 16, is a
                      * source of many primes (3, 5, 17, 257, 65537).
                      */
2157                 hash += *up + (*up << ((i % 16) + 1));
2158                 up++;
2159                 i++;
2160         }
2161         return (hash);
2162 }
2163 
2164 /*
2165  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2166  * equal).  This may need to be changed if less than / greater than is ever
2167  * needed.
2168  */
2169 static int
2170 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2171 {
2172         ts_label_t *lab1 = (ts_label_t *)key1;
2173         ts_label_t *lab2 = (ts_label_t *)key2;
2174 
2175         return (label_equal(lab1, lab2) ? 0 : 1);
2176 }
2177 
2178 /*
2179  * Called by main() to initialize the zones framework.
2180  */
2181 void
2182 zone_init(void)
2183 {
2184         rctl_dict_entry_t *rde;
2185         rctl_val_t *dval;
2186         rctl_set_t *set;
2187         rctl_alloc_gp_t *gp;
2188         rctl_entity_p_t e;
2189         int res;
2190 
2191         ASSERT(curproc == &p0);
2192 
2193         /*
2194          * Create ID space for zone IDs.  ID 0 is reserved for the
2195          * global zone.
2196          */
2197         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2198 
2199         /*
2200          * Initialize generic zone resource controls, if any.
2201          */
2202         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2203             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2204             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2205             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2206 
2207         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2208             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2209             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
2210             RCTL_GLOBAL_INFINITE,
2211             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2212 
2213         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2214             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2215             INT_MAX, INT_MAX, &zone_lwps_ops);
2216 
2217         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2218             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2219             INT_MAX, INT_MAX, &zone_procs_ops);
2220 
2221         /*
2222          * System V IPC resource controls
2223          */
2224         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2225             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2226             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2227 
2228         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2229             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2230             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2231 
2232         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2233             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2234             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2235 
2236         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2237             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2238             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2239 
2240         /*
2241          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2242          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2243          */
2244         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2245         bzero(dval, sizeof (rctl_val_t));
2246         dval->rcv_value = 1;
2247         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2248         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2249         dval->rcv_action_recip_pid = -1;
2250 
2251         rde = rctl_dict_lookup("zone.cpu-shares");
2252         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2253 
2254         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2255             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2256             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2257             &zone_locked_mem_ops);
2258 
2259         rc_zone_max_swap = rctl_register("zone.max-swap",
2260             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2261             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2262             &zone_max_swap_ops);
2263 
2264         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2265             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2266             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2267             &zone_max_lofi_ops);
2268 
2269         /*
2270          * Initialize the ``global zone''.
2271          */
2272         set = rctl_set_create();
2273         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2274         mutex_enter(&p0.p_lock);
2275         e.rcep_p.zone = &zone0;
2276         e.rcep_t = RCENTITY_ZONE;
2277         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2278             gp);
2279 
2280         zone0.zone_nlwps = p0.p_lwpcnt;
2281         zone0.zone_nprocs = 1;
2282         zone0.zone_ntasks = 1;
2283         mutex_exit(&p0.p_lock);
2284         zone0.zone_restart_init = B_TRUE;
2285         zone0.zone_brand = &native_brand;
2286         rctl_prealloc_destroy(gp);
2287         /*
2288          * pool_default hasn't been initialized yet, so we let pool_init()
2289          * take care of making sure the global zone is in the default pool.
2290          */
2291 
2292         /*
2293          * Initialize global zone kstats
2294          */
2295         zone_kstat_create(&zone0);
2296 
2297         /*
2298          * Initialize the zone label.
2299          * MLPs are initialized when tnzonecfg is loaded.
2300          */
2301         zone0.zone_slabel = l_admin_low;
2302         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2303         label_hold(l_admin_low);
2304 
2305         /*
2306          * Initialize the lock for the database structure used by mntfs.
2307          */
2308         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2309 
2310         zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
2311 
2312         mutex_enter(&zonehash_lock);
2313         zone_uniqid(&zone0);
2314         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2315 
2316         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2317             mod_hash_null_valdtor);
2318         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2319             zone_hash_size, mod_hash_null_valdtor);
2320         /*
2321          * maintain zonehashbylabel only for labeled systems
2322          */
2323         if (is_system_labeled())
2324                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2325                     zone_hash_size, mod_hash_null_keydtor,
2326                     mod_hash_null_valdtor, hash_bylabel, NULL,
2327                     hash_labelkey_cmp, KM_SLEEP);
2328         zonecount = 1;
2329 
2330         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2331             (mod_hash_val_t)&zone0);
2332         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2333             (mod_hash_val_t)&zone0);
2334         if (is_system_labeled()) {
2335                 zone0.zone_flags |= ZF_HASHED_LABEL;
2336                 (void) mod_hash_insert(zonehashbylabel,
2337                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2338         }
2339         mutex_exit(&zonehash_lock);
2340 
2341         /*
2342          * We avoid setting zone_kcred until now, since kcred is initialized
2343          * sometime after zone_zsd_init() and before zone_init().
2344          */
2345         zone0.zone_kcred = kcred;
2346         /*
2347          * The global zone is fully initialized (except for zone_rootvp which
2348          * will be set when the root filesystem is mounted).
2349          */
2350         global_zone = &zone0;
2351 
2352         /*
2353  * Set up an event channel for sending zone status change notifications.
2354          */
2355         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2356             EVCH_CREAT);
2357 
2358         if (res)
2359                 panic("sysevent_evc_bind failed during zone setup.\n");
2361 }
2362 
2363 static void
2364 zone_free(zone_t *zone)
2365 {
2366         ASSERT(zone != global_zone);
2367         ASSERT(zone->zone_ntasks == 0);
2368         ASSERT(zone->zone_nlwps == 0);
2369         ASSERT(zone->zone_nprocs == 0);
2370         ASSERT(zone->zone_cred_ref == 0);
2371         ASSERT(zone->zone_kcred == NULL);
2372         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2373             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2374         ASSERT(list_is_empty(&zone->zone_ref_list));
2375 
2376         /*
2377          * Remove any zone caps.
2378          */
2379         cpucaps_zone_remove(zone);
2380 
2381         ASSERT(zone->zone_cpucap == NULL);
2382 
2383         /* remove from deathrow list */
2384         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2385                 ASSERT(zone->zone_ref == 0);
2386                 mutex_enter(&zone_deathrow_lock);
2387                 list_remove(&zone_deathrow, zone);
2388                 mutex_exit(&zone_deathrow_lock);
2389         }
2390 
2391         list_destroy(&zone->zone_ref_list);
2392         zone_free_zsd(zone);
2393         zone_free_datasets(zone);
2394         list_destroy(&zone->zone_dl_list);
2395 
2396         cpu_uarray_free(zone->zone_ustate);
2397 
2398         if (zone->zone_rootvp != NULL)
2399                 VN_RELE(zone->zone_rootvp);
2400         if (zone->zone_rootpath)
2401                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2402         if (zone->zone_name != NULL)
2403                 kmem_free(zone->zone_name, ZONENAME_MAX);
2404         if (zone->zone_slabel != NULL)
2405                 label_rele(zone->zone_slabel);
2406         if (zone->zone_nodename != NULL)
2407                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2408         if (zone->zone_domain != NULL)
2409                 kmem_free(zone->zone_domain, _SYS_NMLN);
2410         if (zone->zone_privset != NULL)
2411                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2412         if (zone->zone_rctls != NULL)
2413                 rctl_set_free(zone->zone_rctls);
2414         if (zone->zone_bootargs != NULL)
2415                 strfree(zone->zone_bootargs);
2416         if (zone->zone_initname != NULL)
2417                 strfree(zone->zone_initname);
2418         if (zone->zone_fs_allowed != NULL)
2419                 strfree(zone->zone_fs_allowed);
2420         if (zone->zone_pfexecd != NULL)
2421                 klpd_freelist(&zone->zone_pfexecd);
2422         id_free(zoneid_space, zone->zone_id);
2423         mutex_destroy(&zone->zone_lock);
2424         cv_destroy(&zone->zone_cv);
2425         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2426         rw_destroy(&zone->zone_mntfs_db_lock);
2427         kmem_free(zone, sizeof (zone_t));
2428 }
2429 
2430 /*
2431  * Convenience function for setting zone status.  See the block comment
2432  * at the top of this file for information about zone status values.
2433  */
2437 static void
2438 zone_status_set(zone_t *zone, zone_status_t status)
2439 {
2441         nvlist_t *nvl = NULL;
2442         ASSERT(MUTEX_HELD(&zone_status_lock));
2443         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2444             status >= zone_status_get(zone));
2445 
2446         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2447             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2448             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2449             zone_status_table[status]) ||
2450             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2451             zone_status_table[zone->zone_status]) ||
2452             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2453             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2454             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2455             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2456 #ifdef DEBUG
2457                 (void) printf(
2458                     "Failed to allocate and send zone state change event.\n");
2459 #endif
2460         }
2461         nvlist_free(nvl);
2462 
2463         zone->zone_status = status;
2464 
2465         cv_broadcast(&zone->zone_cv);
2466 }
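
     /*
      * A privileged userland consumer can observe these status events
      * through libsysevent's channel interfaces.  Illustrative sketch
      * (the subscriber ID and my_handler are hypothetical; error
      * handling omitted):
      *
      *        evchan_t *ch;
      *
      *        (void) sysevent_evc_bind(ZONE_EVENT_CHANNEL, &ch, 0);
      *        (void) sysevent_evc_subscribe(ch, "myzonewatcher",
      *            ZONE_EVENT_STATUS_CLASS, my_handler, NULL, 0);
      *
      * where my_handler() receives each event and can extract the
      * nvlist fields built above (ZONE_CB_NAME, ZONE_CB_NEWSTATE, ...).
      */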
2467 
2468 /*
2469  * Public function to retrieve the zone status.  The zone status may
2470  * change after it is retrieved.
2471  */
2472 zone_status_t
2473 zone_status_get(zone_t *zone)
2474 {
2475         return (zone->zone_status);
2476 }
2477 
2478 static int
2479 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2480 {
2481         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2482         int err = 0;
2483 
2484         ASSERT(zone != global_zone);
2485         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2486                 goto done;      /* EFAULT or ENAMETOOLONG */
2487 
2488         if (zone->zone_bootargs != NULL)
2489                 strfree(zone->zone_bootargs);
2490 
2491         zone->zone_bootargs = strdup(buf);
2492 
2493 done:
2494         kmem_free(buf, BOOTARGS_MAX);
2495         return (err);
2496 }
2497 
2498 static int
2499 zone_set_brand(zone_t *zone, const char *brand)
2500 {
2501         struct brand_attr *attrp;
2502         brand_t *bp;
2503 
2504         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2505         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2506                 kmem_free(attrp, sizeof (struct brand_attr));
2507                 return (EFAULT);
2508         }
2509 
2510         bp = brand_register_zone(attrp);
2511         kmem_free(attrp, sizeof (struct brand_attr));
2512         if (bp == NULL)
2513                 return (EINVAL);
2514 
2515         /*
2516  * This is the only place where a zone can change its brand.
2517          * We already need to hold zone_status_lock to check the zone
2518          * status, so we'll just use that lock to serialize zone
2519          * branding requests as well.
2520          */
2521         mutex_enter(&zone_status_lock);
2522 
2523         /* A zone may be branded only once, and only before it boots */
2524         if ((ZONE_IS_BRANDED(zone)) ||
2525             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2526                 mutex_exit(&zone_status_lock);
2527                 brand_unregister_zone(bp);
2528                 return (EINVAL);
2529         }
2530 
2531         /* set up the brand specific data */
2532         zone->zone_brand = bp;
2533         ZBROP(zone)->b_init_brand_data(zone);
2534 
2535         mutex_exit(&zone_status_lock);
2536         return (0);
2537 }
2538 
2539 static int
2540 zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2541 {
2542         int err = 0;
2543         psecflags_t psf;
2544 
2545         ASSERT(zone != global_zone);
2546 
2547         if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
2548                 return (err);
2549 
2550         if (zone_status_get(zone) > ZONE_IS_READY)
2551                 return (EINVAL);
2552 
2553         if (!psecflags_validate(&psf))
2554                 return (EINVAL);
2555 
2556         (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2557 
2558         /* Set security flags on the zone's zsched */
2559         (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2560             sizeof (zone->zone_zsched->p_secflags));
2561 
2562         return (0);
2563 }
2564 
2565 static int
2566 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2567 {
2568         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2569         int err = 0;
2570 
2571         ASSERT(zone != global_zone);
2572         if ((err = copyinstr(zone_fs_allowed, buf,
2573             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2574                 goto done;
2575 
2576         if (zone->zone_fs_allowed != NULL)
2577                 strfree(zone->zone_fs_allowed);
2578 
2579         zone->zone_fs_allowed = strdup(buf);
2580 
2581 done:
2582         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2583         return (err);
2584 }
2585 
2586 static int
2587 zone_set_initname(zone_t *zone, const char *zone_initname)
2588 {
2589         char initname[INITNAME_SZ];
2590         size_t len;
2591         int err = 0;
2592 
2593         ASSERT(zone != global_zone);
2594         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2595                 return (err);   /* EFAULT or ENAMETOOLONG */
2596 
2597         if (zone->zone_initname != NULL)
2598                 strfree(zone->zone_initname);
2599 
2600         zone->zone_initname = strdup(initname);
2602         return (0);
2603 }
2604 
2605 static int
2606 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2607 {
2608         uint64_t mcap;
2609         int err = 0;
2610 
2611         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2612                 zone->zone_phys_mcap = mcap;
2613 
2614         return (err);
2615 }
2616 
2617 static int
2618 zone_set_sched_class(zone_t *zone, const char *new_class)
2619 {
2620         char sched_class[PC_CLNMSZ];
2621         id_t classid;
2622         int err;
2623 
2624         ASSERT(zone != global_zone);
2625         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2626                 return (err);   /* EFAULT or ENAMETOOLONG */
2627 
2628         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2629                 return (set_errno(EINVAL));
2630         zone->zone_defaultcid = classid;
2631         ASSERT(zone->zone_defaultcid > 0 &&
2632             zone->zone_defaultcid < loaded_classes);
2633 
2634         return (0);
2635 }
2636 
2637 /*
2638  * Block indefinitely waiting for (zone_status >= status)
2639  */
2640 void
2641 zone_status_wait(zone_t *zone, zone_status_t status)
2642 {
2643         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2644 
2645         mutex_enter(&zone_status_lock);
2646         while (zone->zone_status < status) {
2647                 cv_wait(&zone->zone_cv, &zone_status_lock);
2648         }
2649         mutex_exit(&zone_status_lock);
2650 }
2651 
2652 /*
2653  * Private CPR-safe version of zone_status_wait().
2654  */
2655 static void
2656 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2657 {
2658         callb_cpr_t cprinfo;
2659 
2660         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2661 
2662         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2663             str);
2664         mutex_enter(&zone_status_lock);
2665         while (zone->zone_status < status) {
2666                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2667                 cv_wait(&zone->zone_cv, &zone_status_lock);
2668                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2669         }
2670         /*
2671          * zone_status_lock is implicitly released by the following.
2672          */
2673         CALLB_CPR_EXIT(&cprinfo);
2674 }
2675 
2676 /*
2677  * Block until zone enters requested state or signal is received.  Return (0)
2678  * if signaled, non-zero otherwise.
2679  */
2680 int
2681 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2682 {
2683         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2684 
2685         mutex_enter(&zone_status_lock);
2686         while (zone->zone_status < status) {
2687                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2688                         mutex_exit(&zone_status_lock);
2689                         return (0);
2690                 }
2691         }
2692         mutex_exit(&zone_status_lock);
2693         return (1);
2694 }
2695 
2696 /*
2697  * Block until the zone enters the requested state or the timeout expires,
2698  * whichever happens first.  Return (-1) if operation timed out, time remaining
2699  * otherwise.
2700  */
2701 clock_t
2702 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2703 {
2704         clock_t timeleft = 0;
2705 
2706         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2707 
2708         mutex_enter(&zone_status_lock);
2709         while (zone->zone_status < status && timeleft != -1) {
2710                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2711         }
2712         mutex_exit(&zone_status_lock);
2713         return (timeleft);
2714 }
2715 
2716 /*
2717  * Block until the zone enters the requested state, the current process is
2718  * signaled, or the timeout expires, whichever happens first.  Return (-1) if
2719  * operation timed out, 0 if signaled, time remaining otherwise.
2720  */
2721 clock_t
2722 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2723 {
2724         clock_t timeleft = tim - ddi_get_lbolt();
2725 
2726         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2727 
2728         mutex_enter(&zone_status_lock);
2729         while (zone->zone_status < status) {
2730                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2731                     tim);
2732                 if (timeleft <= 0)
2733                         break;
2734         }
2735         mutex_exit(&zone_status_lock);
2736         return (timeleft);
2737 }
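
     /*
      * Illustrative caller of the wait family above (hypothetical;
      * error handling elided): block up to five seconds for a zone to
      * come up, distinguishing timeout from interruption.
      *
      *        clock_t rv;
      *
      *        rv = zone_status_timedwait_sig(zone,
      *            ddi_get_lbolt() + SEC_TO_TICK(5), ZONE_IS_RUNNING);
      *        if (rv == -1)
      *                ... timed out ...
      *        else if (rv == 0)
      *                ... interrupted by a signal ...
      */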
2738 
2739 /*
2740  * Zones have two reference counts: one for references from credential
2741  * structures (zone_cred_ref), and one (zone_ref) for everything else.
2742  * This is so we can allow a zone to be rebooted while there are still
2743  * outstanding cred references, since certain drivers cache dblks (which
2744  * implicitly results in cached creds).  We wait for zone_ref to drop to
2745  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2746  * later freed when the zone_cred_ref drops to 0, though nothing other
2747  * than the zone id and privilege set should be accessed once the zone
2748  * is "dead".
2749  *
2750  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2751  * to force halt/reboot to block waiting for the zone_cred_ref to drop
2752  * to 0.  This can be useful to flush out other sources of cached creds
2753  * that may be less innocuous than the driver case.
2754  *
2755  * Zones also provide a tracked reference counting mechanism in which zone
2756  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2757  * debuggers determine the sources of leaked zone references.  See
2758  * zone_hold_ref() and zone_rele_ref() below for more information.
2759  */
2760 
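     /*
      * Tracked-reference usage sketch (hypothetical consumer; any of
      * the zone_ref_subsys_t constants, e.g. ZONE_REF_NFS, may be
      * used):
      *
      *        zone_ref_t ref;
      *
      *        zone_init_ref(&ref);
      *        zone_hold_ref(z, &ref, ZONE_REF_NFS);
      *        ... long-lived use of z ...
      *        zone_rele_ref(&ref, ZONE_REF_NFS);
      */
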
2761 int zone_wait_for_cred = 0;
2762 
2763 static void
2764 zone_hold_locked(zone_t *z)
2765 {
2766         ASSERT(MUTEX_HELD(&z->zone_lock));
2767         z->zone_ref++;
2768         ASSERT(z->zone_ref != 0);
2769 }
2770 
2771 /*
2772  * Increment the specified zone's reference count.  The zone's zone_t structure
2773  * will not be freed as long as the zone's reference count is nonzero.
2774  * Decrement the zone's reference count via zone_rele().
2775  *
2776  * NOTE: This function should only be used to hold zones for short periods of
2777  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2778  */
2779 void
2780 zone_hold(zone_t *z)
2781 {
2782         mutex_enter(&z->zone_lock);
2783         zone_hold_locked(z);
2784         mutex_exit(&z->zone_lock);
2785 }
2786 
2787 /*
2788  * If the non-cred ref count drops to 1 and either the cred ref count
2789  * is 0 or we aren't waiting for cred references, the zone is ready to
2790  * be destroyed.
2791  */
2792 #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
2793             (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2794 
2795 /*
2796  * Common zone reference release function invoked by zone_rele() and
2797  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2798  * zone's subsystem-specific reference counters are not affected by the
2799  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2800  * removed from the specified zone's reference list.  ref must be non-NULL iff
2801  * subsys is not ZONE_REF_NUM_SUBSYS.
2802  */
2803 static void
2804 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2805 {
2806         boolean_t wakeup;
2807 
2808         mutex_enter(&z->zone_lock);
2809         ASSERT(z->zone_ref != 0);
2810         z->zone_ref--;
2811         if (subsys != ZONE_REF_NUM_SUBSYS) {
2812                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2813                 z->zone_subsys_ref[subsys]--;
2814                 list_remove(&z->zone_ref_list, ref);
2815         }
2816         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2817                 /* no more refs, free the structure */
2818                 mutex_exit(&z->zone_lock);
2819                 zone_free(z);
2820                 return;
2821         }
2822         /* signal zone_destroy so the zone can finish halting */
2823         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2824         mutex_exit(&z->zone_lock);
2825 
2826         if (wakeup) {
2827                 /*
2828                  * Grabbing zonehash_lock here effectively synchronizes with
2829                  * zone_destroy() to avoid missed signals.
2830                  */
2831                 mutex_enter(&zonehash_lock);
2832                 cv_broadcast(&zone_destroy_cv);
2833                 mutex_exit(&zonehash_lock);
2834         }
2835 }
2836 
2837 /*
2838  * Decrement the specified zone's reference count.  The specified zone will
2839  * cease to exist after this function returns if the reference count drops to
2840  * zero.  This function should be paired with zone_hold().
2841  */
2842 void
2843 zone_rele(zone_t *z)
2844 {
2845         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2846 }
2847 
2848 /*
2849  * Initialize a zone reference structure.  This function must be invoked for
2850  * a reference structure before the structure is passed to zone_hold_ref().
2851  */
2852 void
2853 zone_init_ref(zone_ref_t *ref)
2854 {
2855         ref->zref_zone = NULL;
2856         list_link_init(&ref->zref_linkage);
2857 }
2858 
2859 /*
2860  * Acquire a reference to zone z.  The caller must specify the
2861  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2862  * zone_ref_t structure will represent a reference to the specified zone.  Use
2863  * zone_rele_ref() to release the reference.
2864  *
2865  * The referenced zone_t structure will not be freed as long as the zone_t's
2866  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2867  * references.
2868  *
2869  * NOTE: The zone_ref_t structure must be initialized before it is used.
2870  * See zone_init_ref() above.
2871  */
2872 void
2873 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2874 {
2875         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2876 
2877         /*
2878          * Prevent consumers from reusing a reference structure before
2879          * releasing it.
2880          */
2881         VERIFY(ref->zref_zone == NULL);
2882 
2883         ref->zref_zone = z;
2884         mutex_enter(&z->zone_lock);
2885         zone_hold_locked(z);
2886         z->zone_subsys_ref[subsys]++;
2887         ASSERT(z->zone_subsys_ref[subsys] != 0);
2888         list_insert_head(&z->zone_ref_list, ref);
2889         mutex_exit(&z->zone_lock);
2890 }
2891 
2892 /*
2893  * Release the zone reference represented by the specified zone_ref_t.
2894  * The reference is invalid after it's released; however, the zone_ref_t
2895  * structure can be reused without having to invoke zone_init_ref().
2896  * subsys should be the same value that was passed to zone_hold_ref()
2897  * when the reference was acquired.
2898  */
2899 void
2900 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2901 {
2902         zone_rele_common(ref->zref_zone, ref, subsys);
2903 
2904         /*
2905          * Set the zone_ref_t's zref_zone field to NULL to generate panics
2906          * when consumers dereference the reference.  This helps us catch
2907          * consumers who use released references.  Furthermore, this lets
2908          * consumers reuse the zone_ref_t structure without having to
2909          * invoke zone_init_ref().
2910          */
2911         ref->zref_zone = NULL;
2912 }
2913 
2914 void
2915 zone_cred_hold(zone_t *z)
2916 {
2917         mutex_enter(&z->zone_lock);
2918         z->zone_cred_ref++;
2919         ASSERT(z->zone_cred_ref != 0);
2920         mutex_exit(&z->zone_lock);
2921 }
2922 
2923 void
2924 zone_cred_rele(zone_t *z)
2925 {
2926         boolean_t wakeup;
2927 
2928         mutex_enter(&z->zone_lock);
2929         ASSERT(z->zone_cred_ref != 0);
2930         z->zone_cred_ref--;
2931         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2932                 /* no more refs, free the structure */
2933                 mutex_exit(&z->zone_lock);
2934                 zone_free(z);
2935                 return;
2936         }
2937         /*
2938          * If zone_destroy is waiting for the cred references to drain
2939          * out, and they have, signal it.
2940          */
2941         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2942             zone_status_get(z) >= ZONE_IS_DEAD);
2943         mutex_exit(&z->zone_lock);
2944 
2945         if (wakeup) {
2946                 /*
2947                  * Grabbing zonehash_lock here effectively synchronizes with
2948                  * zone_destroy() to avoid missed signals.
2949                  */
2950                 mutex_enter(&zonehash_lock);
2951                 cv_broadcast(&zone_destroy_cv);
2952                 mutex_exit(&zonehash_lock);
2953         }
2954 }
2955 
2956 void
2957 zone_task_hold(zone_t *z)
2958 {
2959         mutex_enter(&z->zone_lock);
2960         z->zone_ntasks++;
2961         ASSERT(z->zone_ntasks != 0);
2962         mutex_exit(&z->zone_lock);
2963 }
2964 
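/*
 * Release a task's hold on its zone.  zone_ntasks also counts zsched's own
 * task, so a count of one means that no user processes remain (the zone is
 * empty), while a count of zero means that zsched itself has exited (the
 * zone is dead).
 */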
2965 void
2966 zone_task_rele(zone_t *zone)
2967 {
2968         uint_t refcnt;
2969 
2970         mutex_enter(&zone->zone_lock);
2971         ASSERT(zone->zone_ntasks != 0);
2972         refcnt = --zone->zone_ntasks;
2973         if (refcnt > 1)      {       /* Common case */
2974                 mutex_exit(&zone->zone_lock);
2975                 return;
2976         }
2977         zone_hold_locked(zone); /* so we can use the zone_t later */
2978         mutex_exit(&zone->zone_lock);
2979         if (refcnt == 1) {
2980                 /*
2981                  * See if the zone is shutting down.
2982                  */
2983                 mutex_enter(&zone_status_lock);
2984                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2985                         goto out;
2986                 }
2987 
2988                 /*
2989                  * Make sure the ntasks didn't change since we
2990                  * dropped zone_lock.
2991                  */
2992                 mutex_enter(&zone->zone_lock);
2993                 if (refcnt != zone->zone_ntasks) {
2994                         mutex_exit(&zone->zone_lock);
2995                         goto out;
2996                 }
2997                 mutex_exit(&zone->zone_lock);
2998 
2999                 /*
3000                  * No more user processes in the zone.  The zone is empty.
3001                  */
3002                 zone_status_set(zone, ZONE_IS_EMPTY);
3003                 goto out;
3004         }
3005 
3006         ASSERT(refcnt == 0);
3007         /*
3008          * zsched has exited; the zone is dead.
3009          */
3010         zone->zone_zsched = NULL;            /* paranoia */
3011         mutex_enter(&zone_status_lock);
3012         zone_status_set(zone, ZONE_IS_DEAD);
3013 out:
3014         mutex_exit(&zone_status_lock);
3015         zone_rele(zone);
3016 }
3017 
3018 zoneid_t
3019 getzoneid(void)
3020 {
3021         return (curproc->p_zone->zone_id);
3022 }
3023 
3024 zoneid_t
3025 getzonedid(void)
3026 {
3027         return (curproc->p_zone->zone_did);
3028 }
3029 
3030 /*
3031  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3032  * check the validity of a zone's state.
3033  */
3034 static zone_t *
3035 zone_find_all_by_id(zoneid_t zoneid)
3036 {
3037         mod_hash_val_t hv;
3038         zone_t *zone = NULL;
3039 
3040         ASSERT(MUTEX_HELD(&zonehash_lock));
3041 
3042         if (mod_hash_find(zonehashbyid,
3043             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3044                 zone = (zone_t *)hv;
3045         return (zone);
3046 }
3047 
3048 static zone_t *
3049 zone_find_all_by_label(const ts_label_t *label)
3050 {
3051         mod_hash_val_t hv;
3052         zone_t *zone = NULL;
3053 
3054         ASSERT(MUTEX_HELD(&zonehash_lock));
3055 
3056         /*
3057          * zonehashbylabel is not maintained for unlabeled systems
3058          */
3059         if (!is_system_labeled())
3060                 return (NULL);
3061         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3062                 zone = (zone_t *)hv;
3063         return (zone);
3064 }
3065 
3066 static zone_t *
3067 zone_find_all_by_name(char *name)
3068 {
3069         mod_hash_val_t hv;
3070         zone_t *zone = NULL;
3071 
3072         ASSERT(MUTEX_HELD(&zonehash_lock));
3073 
3074         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3075                 zone = (zone_t *)hv;
3076         return (zone);
3077 }
3078 
3079 /*
3080  * Public interface for looking up a zone by zoneid.  Only returns the zone if
3081  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3082  * Caller must call zone_rele() once it is done with the zone.
3083  *
3084  * The zone may begin the zone_destroy() sequence immediately after this
3085  * function returns, but may be safely used until zone_rele() is called.
3086  */
3087 zone_t *
3088 zone_find_by_id(zoneid_t zoneid)
3089 {
3090         zone_t *zone;
3091         zone_status_t status;
3092 
3093         mutex_enter(&zonehash_lock);
3094         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3095                 mutex_exit(&zonehash_lock);
3096                 return (NULL);
3097         }
3098         status = zone_status_get(zone);
3099         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3100                 /*
3101                  * For all practical purposes the zone doesn't exist.
3102                  */
3103                 mutex_exit(&zonehash_lock);
3104                 return (NULL);
3105         }
3106         zone_hold(zone);
3107         mutex_exit(&zonehash_lock);
3108         return (zone);
3109 }
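
/*
 * A minimal lookup sketch (hypothetical caller): every successful
 * zone_find_by_id() must be paired with a zone_rele().
 *
 *	zone_t *zone;
 *
 *	if ((zone = zone_find_by_id(zoneid)) == NULL)
 *		return (EINVAL);
 *	... use zone ...
 *	zone_rele(zone);
 */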
3110 
3111 /*
3112  * Similar to zone_find_by_id, but using zone label as the key.
3113  */
3114 zone_t *
3115 zone_find_by_label(const ts_label_t *label)
3116 {
3117         zone_t *zone;
3118         zone_status_t status;
3119 
3120         mutex_enter(&zonehash_lock);
3121         if ((zone = zone_find_all_by_label(label)) == NULL) {
3122                 mutex_exit(&zonehash_lock);
3123                 return (NULL);
3124         }
3125 
3126         status = zone_status_get(zone);
3127         if (status > ZONE_IS_DOWN) {
3128                 /*
3129                  * For all practical purposes the zone doesn't exist.
3130                  */
3131                 mutex_exit(&zonehash_lock);
3132                 return (NULL);
3133         }
3134         zone_hold(zone);
3135         mutex_exit(&zonehash_lock);
3136         return (zone);
3137 }
3138 
3139 /*
3140  * Similar to zone_find_by_id, but using zone name as the key.
3141  */
3142 zone_t *
3143 zone_find_by_name(char *name)
3144 {
3145         zone_t *zone;
3146         zone_status_t status;
3147 
3148         mutex_enter(&zonehash_lock);
3149         if ((zone = zone_find_all_by_name(name)) == NULL) {
3150                 mutex_exit(&zonehash_lock);
3151                 return (NULL);
3152         }
3153         status = zone_status_get(zone);
3154         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3155                 /*
3156                  * For all practical purposes the zone doesn't exist.
3157                  */
3158                 mutex_exit(&zonehash_lock);
3159                 return (NULL);
3160         }
3161         zone_hold(zone);
3162         mutex_exit(&zonehash_lock);
3163         return (zone);
3164 }
3165 
3166 /*
3167  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3168  * if there is a zone "foo" rooted at /foo/root, and the path argument
3169  * is "/foo/root/proc", it will return the held zone_t corresponding to
3170  * zone "foo".
3171  *
3172  * zone_find_by_path() always returns a non-NULL value, since at the
3173  * very least every path will be contained in the global zone.
3174  *
3175  * As with the other zone_find_by_*() functions, the caller is
3176  * responsible for zone_rele()ing the return value of this function.
3177  */
3178 zone_t *
3179 zone_find_by_path(const char *path)
3180 {
3181         zone_t *zone;
3182         zone_t *zret = NULL;
3183         zone_status_t status;
3184 
3185         if (path == NULL) {
3186                 /*
3187                  * Call from rootconf().
3188                  */
3189                 zone_hold(global_zone);
3190                 return (global_zone);
3191         }
3192         ASSERT(*path == '/');
3193         mutex_enter(&zonehash_lock);
3194         for (zone = list_head(&zone_active); zone != NULL;
3195             zone = list_next(&zone_active, zone)) {
3196                 if (ZONE_PATH_VISIBLE(path, zone))
3197                         zret = zone;
3198         }
3199         ASSERT(zret != NULL);
3200         status = zone_status_get(zret);
3201         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3202                 /*
3203                  * Zone practically doesn't exist.
3204                  */
3205                 zret = global_zone;
3206         }
3207         zone_hold(zret);
3208         mutex_exit(&zonehash_lock);
3209         return (zret);
3210 }
3211 
3212 /*
3213  * Public interface for updating per-zone load averages.  Called once per
3214  * second.
3215  *
3216  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3217  */
3218 void
3219 zone_loadavg_update(void)
3220 {
3221         zone_t *zp;
3222         zone_status_t status;
3223         struct loadavg_s *lavg;
3224         hrtime_t zone_total;
3225         uint64_t tmp;
3226         int i;
3227         hrtime_t hr_avg;
3228         int nrun;
3229         static int64_t f[3] = { 135, 27, 9 };
3230         int64_t q, r;
3231 
3232         mutex_enter(&zonehash_lock);
3233         for (zp = list_head(&zone_active); zp != NULL;
3234             zp = list_next(&zone_active, zp)) {
3235                 mutex_enter(&zp->zone_lock);
3236 
3237                 /* Skip zones that are on the way down or not yet up */
3238                 status = zone_status_get(zp);
3239                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3240                         /* For all practical purposes the zone doesn't exist. */
3241                         mutex_exit(&zp->zone_lock);
3242                         continue;
3243                 }
3244 
3245                 /*
3246                  * Update the 10 second moving average data in zone_loadavg.
3247                  */
3248                 lavg = &zp->zone_loadavg;
3249 
3250                 tmp = cpu_uarray_sum_all(zp->zone_ustate);
3251                 zone_total = UINT64_OVERFLOW_TO_INT64(tmp);
3252 
3253                 scalehrtime(&zone_total);
3254 
3255                 /* The zone_total should always be increasing. */
3256                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3257                     zone_total - lavg->lg_total : 0;
3258                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3259                 /* lg_total holds the prev. 1 sec. total */
3260                 lavg->lg_total = zone_total;
3261 
3262                 /*
3263                  * To simplify the calculation, we don't calculate the load avg.
3264                  * until the zone has been up for at least 10 seconds and our
3265                  * moving average is thus full.
3266                  */
3267                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3268                         lavg->lg_len++;
3269                         mutex_exit(&zp->zone_lock);
3270                         continue;
3271                 }
3272 
3273                 /* Now calculate the 1 min, 5 min and 15 min load averages. */
3274                 hr_avg = 0;
3275                 for (i = 0; i < S_LOADAVG_SZ; i++)
3276                         hr_avg += lavg->lg_loads[i];
3277                 hr_avg = hr_avg / S_LOADAVG_SZ;
3278                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3279 
3280                 /* Compute load avg. See comment in calcloadavg() */
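                /*
                 * The constants in f[] implement exponential decay over the
                 * 60, 300 and 900 second averaging intervals, with one
                 * sample per second:
                 *
                 *	(1 - exp(-1/60))  << 13 = 135
                 *	(1 - exp(-1/300)) << 13 = 27
                 *	(1 - exp(-1/900)) << 13 = 9
                 *
                 * The shifts below perform the same fixed-point
                 * multiply-accumulate as calcloadavg() in clock.c.
                 */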
3281                 for (i = 0; i < 3; i++) {
3282                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3283                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3284                         zp->zone_hp_avenrun[i] +=
3285                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3286 
3287                         /* avenrun[] can only hold 31 bits of load avg. */
3288                         if (zp->zone_hp_avenrun[i] <
3289                             ((uint64_t)1<<(31+16-FSHIFT)))
3290                                 zp->zone_avenrun[i] = (int32_t)
3291                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3292                         else
3293                                 zp->zone_avenrun[i] = 0x7fffffff;
3294                 }
3295 
3296                 mutex_exit(&zp->zone_lock);
3297         }
3298         mutex_exit(&zonehash_lock);
3299 }
3300 
3301 /*
3302  * Get the number of cpus visible to this zone.  The system-wide global
3303  * 'ncpus' is returned if pools are disabled, the caller is in the
3304  * global zone, or a NULL zone argument is passed in.
3305  */
3306 int
3307 zone_ncpus_get(zone_t *zone)
3308 {
3309         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3310 
3311         return (myncpus != 0 ? myncpus : ncpus);
3312 }
3313 
3314 /*
3315  * Get the number of online cpus visible to this zone.  The system-wide
3316  * global 'ncpus_online' is returned if pools are disabled, the caller
3317  * is in the global zone, or a NULL zone argument is passed in.
3318  */
3319 int
3320 zone_ncpus_online_get(zone_t *zone)
3321 {
3322         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3323 
3324         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3325 }
3326 
3327 /*
3328  * Return the pool to which the zone is currently bound.
3329  */
3330 pool_t *
3331 zone_pool_get(zone_t *zone)
3332 {
3333         ASSERT(pool_lock_held());
3334 
3335         return (zone->zone_pool);
3336 }
3337 
3338 /*
3339  * Set the zone's pool pointer and update the zone's visibility to match
3340  * the resources in the new pool.
3341  */
3342 void
3343 zone_pool_set(zone_t *zone, pool_t *pool)
3344 {
3345         ASSERT(pool_lock_held());
3346         ASSERT(MUTEX_HELD(&cpu_lock));
3347 
3348         zone->zone_pool = pool;
3349         zone_pset_set(zone, pool->pool_pset->pset_id);
3350 }
3351 
3352 /*
3353  * Return the cached value of the id of the processor set to which the
3354  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3355  * facility is disabled.
3356  */
3357 psetid_t
3358 zone_pset_get(zone_t *zone)
3359 {
3360         ASSERT(MUTEX_HELD(&cpu_lock));
3361 
3362         return (zone->zone_psetid);
3363 }
3364 
3365 /*
3366  * Set the cached value of the id of the processor set to which the zone
3367  * is currently bound.  Also update the zone's visibility to match the
3368  * resources in the new processor set.
3369  */
3370 void
3371 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3372 {
3373         psetid_t oldpsetid;
3374 
3375         ASSERT(MUTEX_HELD(&cpu_lock));
3376         oldpsetid = zone_pset_get(zone);
3377 
3378         if (oldpsetid == newpsetid)
3379                 return;
3380         /*
3381          * Global zone sees all.
3382          */
3383         if (zone != global_zone) {
3384                 zone->zone_psetid = newpsetid;
3385                 if (newpsetid != ZONE_PS_INVAL)
3386                         pool_pset_visibility_add(newpsetid, zone);
3387                 if (oldpsetid != ZONE_PS_INVAL)
3388                         pool_pset_visibility_remove(oldpsetid, zone);
3389         }
3390         /*
3391          * Disabling pools, so we should start using the global values
3392          * for ncpus and ncpus_online.
3393          */
3394         if (newpsetid == ZONE_PS_INVAL) {
3395                 zone->zone_ncpus = 0;
3396                 zone->zone_ncpus_online = 0;
3397         }
3398 }
3399 
3400 /*
3401  * Walk the list of active zones and issue the provided callback for
3402  * each of them.
3403  *
3404  * Caller must not be holding any locks that may be acquired under
3405  * zonehash_lock.  See comment at the beginning of the file for a list of
3406  * common locks and their interactions with zones.
3407  */
3408 int
3409 zone_walk(int (*cb)(zone_t *, void *), void *data)
3410 {
3411         zone_t *zone;
3412         int ret = 0;
3413         zone_status_t status;
3414 
3415         mutex_enter(&zonehash_lock);
3416         for (zone = list_head(&zone_active); zone != NULL;
3417             zone = list_next(&zone_active, zone)) {
3418                 /*
3419                  * Skip zones that shouldn't be externally visible.
3420                  */
3421                 status = zone_status_get(zone);
3422                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3423                         continue;
3424                 /*
3425                  * Bail immediately if any callback invocation returns a
3426                  * non-zero value.
3427                  */
3428                 ret = (*cb)(zone, data);
3429                 if (ret != 0)
3430                         break;
3431         }
3432         mutex_exit(&zonehash_lock);
3433         return (ret);
3434 }
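
/*
 * A minimal zone_walk() callback sketch (hypothetical code, not part of
 * the original source): count the zones visible to the walk.
 *
 *	static int
 *	zone_count_cb(zone_t *zp, void *arg)
 *	{
 *		uint_t *countp = arg;
 *
 *		(*countp)++;
 *		return (0);	(returning non-zero would stop the walk)
 *	}
 *
 *	uint_t count = 0;
 *	(void) zone_walk(zone_count_cb, &count);
 */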
3435 
3436 static int
3437 zone_set_root(zone_t *zone, const char *upath)
3438 {
3439         vnode_t *vp;
3440         int trycount;
3441         int error = 0;
3442         char *path;
3443         struct pathname upn, pn;
3444         size_t pathlen;
3445 
3446         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3447                 return (error);
3448 
3449         pn_alloc(&pn);
3450 
3451         /* prevent infinite loop */
3452         trycount = 10;
3453         for (;;) {
3454                 if (--trycount <= 0) {
3455                         error = ESTALE;
3456                         goto out;
3457                 }
3458 
3459                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3460                         /*
3461                          * VOP_ACCESS() may cover 'vp' with a new
3462                          * filesystem, if 'vp' is an autoFS vnode.
3463                          * Get the new 'vp' if so.
3464                          */
3465                         if ((error =
3466                             VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3467                             (!vn_ismntpt(vp) ||
3468                             (error = traverse(&vp)) == 0)) {
3469                                 pathlen = pn.pn_pathlen + 2;
3470                                 path = kmem_alloc(pathlen, KM_SLEEP);
3471                                 (void) strncpy(path, pn.pn_path,
3472                                     pn.pn_pathlen + 1);
3473                                 path[pathlen - 2] = '/';
3474                                 path[pathlen - 1] = '\0';
3475                                 pn_free(&pn);
3476                                 pn_free(&upn);
3477 
3478                                 /* Success! */
3479                                 break;
3480                         }
3481                         VN_RELE(vp);
3482                 }
3483                 if (error != ESTALE)
3484                         goto out;
3485         }
3486 
3487         ASSERT(error == 0);
3488         zone->zone_rootvp = vp;              /* we hold a reference to vp */
3489         zone->zone_rootpath = path;
3490         zone->zone_rootpathlen = pathlen;
3491         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3492                 zone->zone_flags |= ZF_IS_SCRATCH;
3493         return (0);
3494 
3495 out:
3496         pn_free(&pn);
3497         pn_free(&upn);
3498         return (error);
3499 }
3500 
3501 #define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
3502                         ((c) >= 'a' && (c) <= 'z') || \
3503                         ((c) >= 'A' && (c) <= 'Z'))
3504 
3505 static int
3506 zone_set_name(zone_t *zone, const char *uname)
3507 {
3508         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3509         size_t len;
3510         int i, err;
3511 
3512         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3513                 kmem_free(kname, ZONENAME_MAX);
3514                 return (err);   /* EFAULT or ENAMETOOLONG */
3515         }
3516 
3517         /* len, including the terminating NUL, must be less than ZONENAME_MAX */
3518         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] == '\0') {
3519                 kmem_free(kname, ZONENAME_MAX);
3520                 return (EINVAL);
3521         }
3522 
3523         /*
3524          * Name must start with an alphanumeric and must contain only
3525          * alphanumerics, '-', '_' and '.'.
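         * For example, "web01" and "db.primary-1" would be accepted here,
         * while "-foo" and "my zone" would be rejected.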
3526          */
3527         if (!isalnum(kname[0])) {
3528                 kmem_free(kname, ZONENAME_MAX);
3529                 return (EINVAL);
3530         }
3531         for (i = 1; i < len - 1; i++) {
3532                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3533                     kname[i] != '.') {
3534                         kmem_free(kname, ZONENAME_MAX);
3535                         return (EINVAL);
3536                 }
3537         }
3538 
3539         zone->zone_name = kname;
3540         return (0);
3541 }
3542 
3543 /*
3544  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3545  * is NULL or it points to a zone with no hostid emulation, then the machine's
3546  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3547  * zero if neither the zone nor the host machine (global zone) has a hostid.
3548  * It returns HW_INVALID_HOSTID if it attempts to return the machine's hostid
3549  * and that hostid is invalid.
3550  */
3551 uint32_t
3552 zone_get_hostid(zone_t *zonep)
3553 {
3554         unsigned long machine_hostid;
3555 
3556         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3557                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3558                         return (HW_INVALID_HOSTID);
3559                 return ((uint32_t)machine_hostid);
3560         }
3561         return (zonep->zone_hostid);
3562 }
3563 
3564 /*
3565  * Similar to thread_create(), but makes sure the thread is in the appropriate
3566  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3567  */
3568 /*ARGSUSED*/
3569 kthread_t *
3570 zthread_create(
3571     caddr_t stk,
3572     size_t stksize,
3573     void (*proc)(),
3574     void *arg,
3575     size_t len,
3576     pri_t pri)
3577 {
3578         kthread_t *t;
3579         zone_t *zone = curproc->p_zone;
3580         proc_t *pp = zone->zone_zsched;
3581 
3582         zone_hold(zone);        /* Reference to be dropped when thread exits */
3583 
3584         /*
3585          * No-one should be trying to create threads if the zone is shutting
3586          * down and there aren't any kernel threads around.  See comment
3587          * in zthread_exit().
3588          */
3589         ASSERT(!(zone->zone_kthreads == NULL &&
3590             zone_status_get(zone) >= ZONE_IS_EMPTY));
3591         /*
3592          * Create a thread, but don't let it run until we've finished setting
3593          * things up.
3594          */
3595         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3596         ASSERT(t->t_forw == NULL);
3597         mutex_enter(&zone_status_lock);
3598         if (zone->zone_kthreads == NULL) {
3599                 t->t_forw = t->t_back = t;
3600         } else {
3601                 kthread_t *tx = zone->zone_kthreads;
3602 
3603                 t->t_forw = tx;
3604                 t->t_back = tx->t_back;
3605                 tx->t_back->t_forw = t;
3606                 tx->t_back = t;
3607         }
3608         zone->zone_kthreads = t;
3609         mutex_exit(&zone_status_lock);
3610 
3611         mutex_enter(&pp->p_lock);
3612         t->t_proc_flag |= TP_ZTHREAD;
3613         project_rele(t->t_proj);
3614         t->t_proj = project_hold(pp->p_task->tk_proj);
3615 
3616         /*
3617          * Setup complete, let it run.
3618          */
3619         thread_lock(t);
3620         t->t_schedflag |= TS_ALLSTART;
3621         setrun_locked(t);
3622         thread_unlock(t);
3623 
3624         mutex_exit(&pp->p_lock);
3625 
3626         return (t);
3627 }
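
/*
 * A minimal usage sketch (hypothetical worker, not part of the original
 * source): a per-zone kernel thread is created with zthread_create() and
 * terminates by calling zthread_exit() below.
 *
 *	static void
 *	my_zone_worker(void *arg)
 *	{
 *		... per-zone work ...
 *		zthread_exit();
 *	}
 *
 *	(void) zthread_create(NULL, 0, my_zone_worker, NULL, 0, minclsyspri);
 */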
3628 
3629 /*
3630  * Similar to thread_exit().  Must be called by threads created via
3631  * zthread_create().
3632  */
3633 void
3634 zthread_exit(void)
3635 {
3636         kthread_t *t = curthread;
3637         proc_t *pp = curproc;
3638         zone_t *zone = pp->p_zone;
3639 
3640         mutex_enter(&zone_status_lock);
3641 
3642         /*
3643          * Reparent to p0
3644          */
3645         kpreempt_disable();
3646         mutex_enter(&pp->p_lock);
3647         t->t_proc_flag &= ~TP_ZTHREAD;
3648         t->t_procp = &p0;
3649         hat_thread_exit(t);
3650         mutex_exit(&pp->p_lock);
3651         kpreempt_enable();
3652 
3653         if (t->t_back == t) {
3654                 ASSERT(t->t_forw == t);
3655                 /*
3656                  * If the zone is empty, once the thread count
3657                  * goes to zero no further kernel threads can be
3658                  * created.  This is because if the creator is a process
3659                  * in the zone, then it must have exited before the zone
3660                  * state could be set to ZONE_IS_EMPTY.
3661                  * Otherwise, if the creator is a kernel thread in the
3662                  * zone, the thread count is non-zero.
3663                  *
3664                  * This really means that non-zone kernel threads should
3665                  * not create zone kernel threads.
3666                  */
3667                 zone->zone_kthreads = NULL;
3668                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3669                         zone_status_set(zone, ZONE_IS_DOWN);
3670                         /*
3671                          * Remove any CPU caps on this zone.
3672                          */
3673                         cpucaps_zone_remove(zone);
3674                 }
3675         } else {
3676                 t->t_forw->t_back = t->t_back;
3677                 t->t_back->t_forw = t->t_forw;
3678                 if (zone->zone_kthreads == t)
3679                         zone->zone_kthreads = t->t_forw;
3680         }
3681         mutex_exit(&zone_status_lock);
3682         zone_rele(zone);
3683         thread_exit();
3684         /* NOTREACHED */
3685 }
3686 
3687 static void
3688 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3689 {
3690         vnode_t *oldvp;
3691 
3692         /* we're going to hold a reference here to the directory */
3693         VN_HOLD(vp);
3694 
3695         /* update abs cwd/root path see c2/audit.c */
3696         if (AU_AUDITING())
3697                 audit_chdirec(vp, vpp);
3698 
3699         mutex_enter(&pp->p_lock);
3700         oldvp = *vpp;
3701         *vpp = vp;
3702         mutex_exit(&pp->p_lock);
3703         if (oldvp != NULL)
3704                 VN_RELE(oldvp);
3705 }
3706 
3707 /*
3708  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3709  */
3710 static int
3711 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3712 {
3713         nvpair_t *nvp = NULL;
3714         boolean_t priv_set = B_FALSE;
3715         boolean_t limit_set = B_FALSE;
3716         boolean_t action_set = B_FALSE;
3717 
3718         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3719                 const char *name;
3720                 uint64_t ui64;
3721 
3722                 name = nvpair_name(nvp);
3723                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3724                         return (EINVAL);
3725                 (void) nvpair_value_uint64(nvp, &ui64);
3726                 if (strcmp(name, "privilege") == 0) {
3727                         /*
3728                          * Currently only privileged values are allowed, but
3729                          * this may change in the future.
3730                          */
3731                         if (ui64 != RCPRIV_PRIVILEGED)
3732                                 return (EINVAL);
3733                         rv->rcv_privilege = ui64;
3734                         priv_set = B_TRUE;
3735                 } else if (strcmp(name, "limit") == 0) {
3736                         rv->rcv_value = ui64;
3737                         limit_set = B_TRUE;
3738                 } else if (strcmp(name, "action") == 0) {
3739                         if (ui64 != RCTL_LOCAL_NOACTION &&
3740                             ui64 != RCTL_LOCAL_DENY)
3741                                 return (EINVAL);
3742                         rv->rcv_flagaction = ui64;
3743                         action_set = B_TRUE;
3744                 } else {
3745                         return (EINVAL);
3746                 }
3747         }
3748 
3749         if (!(priv_set && limit_set && action_set))
3750                 return (EINVAL);
3751         rv->rcv_action_signal = 0;
3752         rv->rcv_action_recipient = NULL;
3753         rv->rcv_action_recip_pid = -1;
3754         rv->rcv_firing_time = 0;
3755 
3756         return (0);
3757 }
3758 
3759 /*
3760  * Non-global zone version of start_init.
3761  */
3762 void
3763 zone_start_init(void)
3764 {
3765         proc_t *p = ttoproc(curthread);
3766         zone_t *z = p->p_zone;
3767 
3768         ASSERT(!INGLOBALZONE(curproc));
3769 
3770         /*
3771          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3772          * storing just the pid of init is sufficient.
3773          */
3774         z->zone_proc_initpid = p->p_pid;
3775 
3776         /*
3777          * We maintain zone_boot_err so that we can return the cause of the
3778          * failure back to the caller of the zone_boot syscall.
3779          */
3780         p->p_zone->zone_boot_err = start_init_common();
3781 
3782         /*
3783          * We will prevent booting zones from becoming running zones if the
3784          * global zone is shutting down.
3785          */
3786         mutex_enter(&zone_status_lock);
3787         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3788             ZONE_IS_SHUTTING_DOWN) {
3789                 /*
3790                  * Make sure we are still in the booting state -- we could have
3791                  * raced and already be shutting down, or even further along.
3792                  */
3793                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
3794                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3795                 }
3796                 mutex_exit(&zone_status_lock);
3797                 /* It's gone bad, dispose of the process */
3798                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3799                         mutex_enter(&p->p_lock);
3800                         ASSERT(p->p_flag & SEXITLWPS);
3801                         lwp_exit();
3802                 }
3803         } else {
3804                 if (zone_status_get(z) == ZONE_IS_BOOTING)
3805                         zone_status_set(z, ZONE_IS_RUNNING);
3806                 mutex_exit(&zone_status_lock);
3807                 /* cause the process to return to userland. */
3808                 lwp_rtt();
3809         }
3810 }
3811 
3812 struct zsched_arg {
3813         zone_t *zone;
3814         nvlist_t *nvlist;
3815 };
3816 
3817 /*
3818  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3819  * anything to do with scheduling, but rather with the fact that
3820  * per-zone kernel threads are parented to zsched, just like regular
3821  * kernel threads are parented to sched (p0).
3822  *
3823  * zsched is also responsible for launching init for the zone.
3824  */
3825 static void
3826 zsched(void *arg)
3827 {
3828         struct zsched_arg *za = arg;
3829         proc_t *pp = curproc;
3830         proc_t *initp = proc_init;
3831         zone_t *zone = za->zone;
3832         cred_t *cr, *oldcred;
3833         rctl_set_t *set;
3834         rctl_alloc_gp_t *gp;
3835         contract_t *ct = NULL;
3836         task_t *tk, *oldtk;
3837         rctl_entity_p_t e;
3838         kproject_t *pj;
3839 
3840         nvlist_t *nvl = za->nvlist;
3841         nvpair_t *nvp = NULL;
3842 
3843         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3844         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3845         PTOU(pp)->u_argc = 0;
3846         PTOU(pp)->u_argv = 0;
3847         PTOU(pp)->u_envp = 0;
3848         PTOU(pp)->u_commpagep = 0;
3849         closeall(P_FINFO(pp));
3850 
3851         /*
3852          * We are this zone's "zsched" process.  As the zone isn't generally
3853          * visible yet we don't need to grab any locks before initializing its
3854          * zone_zsched pointer.
3855          */
3856         zone_hold(zone);  /* this hold is released by zone_destroy() */
3857         zone->zone_zsched = pp;
3858         mutex_enter(&pp->p_lock);
3859         pp->p_zone = zone;
3860         mutex_exit(&pp->p_lock);
3861 
3862         /*
3863          * Disassociate process from its 'parent'; parent ourselves to init
3864          * (pid 1) and change other values as needed.
3865          */
3866         sess_create();
3867 
3868         mutex_enter(&pidlock);
3869         proc_detach(pp);
3870         pp->p_ppid = 1;
3871         pp->p_flag |= SZONETOP;
3872         pp->p_ancpid = 1;
3873         pp->p_parent = initp;
3874         pp->p_psibling = NULL;
3875         if (initp->p_child)
3876                 initp->p_child->p_psibling = pp;
3877         pp->p_sibling = initp->p_child;
3878         initp->p_child = pp;
3879 
3880         /* Decrement what newproc() incremented. */
3881         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3882         /*
3883          * Our credentials are about to become kcred-like, so we don't care
3884          * about the caller's ruid.
3885          */
3886         upcount_inc(crgetruid(kcred), zone->zone_id);
3887         mutex_exit(&pidlock);
3888 
3889         /*
3890          * getting out of global zone, so decrement lwp and process counts
3891          */
3892         pj = pp->p_task->tk_proj;
3893         mutex_enter(&global_zone->zone_nlwps_lock);
3894         pj->kpj_nlwps -= pp->p_lwpcnt;
3895         global_zone->zone_nlwps -= pp->p_lwpcnt;
3896         pj->kpj_nprocs--;
3897         global_zone->zone_nprocs--;
3898         mutex_exit(&global_zone->zone_nlwps_lock);
3899 
3900         /*
3901          * Decrement locked memory counts on old zone and project.
3902          */
3903         mutex_enter(&global_zone->zone_mem_lock);
3904         global_zone->zone_locked_mem -= pp->p_locked_mem;
3905         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3906         mutex_exit(&global_zone->zone_mem_lock);
3907 
3908         /*
3909          * Create and join a new task in project '0' of this zone.
3910          *
3911          * We don't need to call holdlwps() since we know we're the only lwp in
3912          * this process.
3913          *
3914          * task_join() returns with p_lock held.
3915          */
3916         tk = task_create(0, zone);
3917         mutex_enter(&cpu_lock);
3918         oldtk = task_join(tk, 0);
3919 
3920         pj = pp->p_task->tk_proj;
3921 
3922         mutex_enter(&zone->zone_mem_lock);
3923         zone->zone_locked_mem += pp->p_locked_mem;
3924         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3925         mutex_exit(&zone->zone_mem_lock);
3926 
3927         /*
3928          * add lwp and process counts to zsched's zone, and increment
3929          * project's task and process count due to the task created in
3930          * the above task_create.
3931          */
3932         mutex_enter(&zone->zone_nlwps_lock);
3933         pj->kpj_nlwps += pp->p_lwpcnt;
3934         pj->kpj_ntasks += 1;
3935         zone->zone_nlwps += pp->p_lwpcnt;
3936         pj->kpj_nprocs++;
3937         zone->zone_nprocs++;
3938         mutex_exit(&zone->zone_nlwps_lock);
3939 
3940         mutex_exit(&curproc->p_lock);
3941         mutex_exit(&cpu_lock);
3942         task_rele(oldtk);
3943 
3944         /*
3945          * The process was created by a process in the global zone, hence the
3946          * credentials are wrong.  We might as well have kcred-ish credentials.
3947          */
3948         cr = zone->zone_kcred;
3949         crhold(cr);
3950         mutex_enter(&pp->p_crlock);
3951         oldcred = pp->p_cred;
3952         pp->p_cred = cr;
3953         mutex_exit(&pp->p_crlock);
3954         crfree(oldcred);
3955 
3956         /*
3957          * Hold credentials again (for thread)
3958          */
3959         crhold(cr);
3960 
3961         /*
3962          * p_lwpcnt can't change since this is a kernel process.
3963          */
3964         crset(pp, cr);
3965 
3966         /*
3967          * Chroot
3968          */
3969         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3970         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3971 
3972         /*
3973          * Initialize zone's rctl set.
3974          */
3975         set = rctl_set_create();
3976         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3977         mutex_enter(&pp->p_lock);
3978         e.rcep_p.zone = zone;
3979         e.rcep_t = RCENTITY_ZONE;
3980         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3981         mutex_exit(&pp->p_lock);
3982         rctl_prealloc_destroy(gp);
3983 
3984         /*
3985          * Apply the rctls passed in to zone_create().  This is basically a list
3986          * assignment: all of the old values are removed and the new ones
3987          * inserted.  That is, if an empty list is passed in, all values are
3988          * removed.
3989          */
3990         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3991                 rctl_dict_entry_t *rde;
3992                 rctl_hndl_t hndl;
3993                 char *name;
3994                 nvlist_t **nvlarray;
3995                 uint_t i, nelem;
3996                 int error;      /* For ASSERT()s */
3997 
3998                 name = nvpair_name(nvp);
3999                 hndl = rctl_hndl_lookup(name);
4000                 ASSERT(hndl != -1);
4001                 rde = rctl_dict_lookup_hndl(hndl);
4002                 ASSERT(rde != NULL);
4003 
4004                 for (; /* ever */; ) {
4005                         rctl_val_t oval;
4006 
4007                         mutex_enter(&pp->p_lock);
4008                         error = rctl_local_get(hndl, NULL, &oval, pp);
4009                         mutex_exit(&pp->p_lock);
4010                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
4011                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4012                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
4013                                 break;
4014                         mutex_enter(&pp->p_lock);
4015                         error = rctl_local_delete(hndl, &oval, pp);
4016                         mutex_exit(&pp->p_lock);
4017                         ASSERT(error == 0);
4018                 }
4019                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4020                 ASSERT(error == 0);
4021                 for (i = 0; i < nelem; i++) {
4022                         rctl_val_t *nvalp;
4023 
4024                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4025                         error = nvlist2rctlval(nvlarray[i], nvalp);
4026                         ASSERT(error == 0);
4027                         /*
4028                          * rctl_local_insert can fail if the value being
4029                          * inserted is a duplicate; this is OK.
4030                          */
4031                         mutex_enter(&pp->p_lock);
4032                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
4033                                 kmem_cache_free(rctl_val_cache, nvalp);
4034                         mutex_exit(&pp->p_lock);
4035                 }
4036         }
4037 
4038         /*
4039          * Tell the world that we're done setting up.
4040          *
4041          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4042          * and atomically set the zone's processor set visibility.  Once
4043          * we drop pool_lock() this zone will automatically get updated
4044          * to reflect any future changes to the pools configuration.
4045          *
4046          * Note that after we drop the locks below (zonehash_lock in
4047          * particular) other operations such as a zone_getattr call can
4048          * now proceed and observe the zone. That is the reason for doing a
4049          * state transition to the INITIALIZED state.
4050          */
4051         pool_lock();
4052         mutex_enter(&cpu_lock);
4053         mutex_enter(&zonehash_lock);
4054         zone_uniqid(zone);
4055         zone_zsd_configure(zone);
4056         if (pool_state == POOL_ENABLED)
4057                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
4058         mutex_enter(&zone_status_lock);
4059         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4060         zone_status_set(zone, ZONE_IS_INITIALIZED);
4061         mutex_exit(&zone_status_lock);
4062         mutex_exit(&zonehash_lock);
4063         mutex_exit(&cpu_lock);
4064         pool_unlock();
4065 
4066         /* Now call the create callbacks for all ZSD keys */
4067         zsd_apply_all_keys(zsd_apply_create, zone);
4068 
4069         /* The callbacks are complete. Mark ZONE_IS_READY */
4070         mutex_enter(&zone_status_lock);
4071         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4072         zone_status_set(zone, ZONE_IS_READY);
4073         mutex_exit(&zone_status_lock);
4074 
4075         /*
4076          * Once we see the zone transition to the ZONE_IS_BOOTING state,
4077          * we launch init, and set the state to running.
4078          */
4079         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4080 
4081         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4082                 id_t cid;
4083 
4084                 /*
4085                  * Ok, this is a little complicated.  We need to grab the
4086                  * zone's pool's scheduling class ID; note that by now, we
4087                  * are already bound to a pool if we need to be (zoneadmd
4088                  * will have done that to us while we're in the READY
4089                  * state).  *But* the scheduling class for the zone's 'init'
4090                  * must be explicitly passed to newproc, which doesn't
4091                  * respect pool bindings.
4092                  *
4093                  * We hold the pool_lock across the call to newproc() to
4094                  * close the obvious race: the pool's scheduling class
4095                  * could change before we manage to create the LWP with
4096                  * classid 'cid'.
4097                  */
4098                 pool_lock();
4099                 if (zone->zone_defaultcid > 0)
4100                         cid = zone->zone_defaultcid;
4101                 else
4102                         cid = pool_get_class(zone->zone_pool);
4103                 if (cid == -1)
4104                         cid = defaultcid;
4105 
4106                 /*
4107                  * If this fails, zone_boot will ultimately fail.  The
4108                  * state of the zone will be set to SHUTTING_DOWN-- userland
4109                  * will have to tear down the zone, and fail, or try again.
4110                  */
4111                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4112                     minclsyspri - 1, &ct, 0)) != 0) {
4113                         mutex_enter(&zone_status_lock);
4114                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4115                         mutex_exit(&zone_status_lock);
4116                 } else {
4117                         zone->zone_boot_time = gethrestime_sec();
4118                 }
4119 
4120                 pool_unlock();
4121         }
4122 
4123         /*
4124          * Wait for zone_destroy() to be called.  This is what we spend
4125          * most of our life doing.
4126          */
4127         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4128 
4129         if (ct)
4130                 /*
4131                  * At this point the process contract should be empty.
4132                  * (Though if it isn't, it's not the end of the world.)
4133                  */
4134                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4135 
4136         /*
4137          * Allow kcred to be freed when all referring processes
4138          * (including this one) go away.  We can't just do this in
4139          * zone_free because we need to wait for the zone_cred_ref to
4140          * drop to 0 before calling zone_free, and the existence of
4141          * zone_kcred will prevent that.  Thus, we call crfree here to
4142          * balance the crdup in zone_create.  The crhold calls earlier
4143          * in zsched will be dropped when the thread and process exit.
4144          */
4145         crfree(zone->zone_kcred);
4146         zone->zone_kcred = NULL;
4147 
4148         exit(CLD_EXITED, 0);
4149 }
4150 
4151 /*
4152  * Helper function to determine if there are any submounts of the
4153  * provided path.  Used to make sure the zone doesn't "inherit" any
4154  * mounts from before it is created.
4155  */
4156 static uint_t
4157 zone_mount_count(const char *rootpath)
4158 {
4159         vfs_t *vfsp;
4160         uint_t count = 0;
4161         size_t rootpathlen = strlen(rootpath);
4162 
4163         /*
4164          * Holding zonehash_lock prevents race conditions with
4165          * vfs_list_add()/vfs_list_remove() since we serialize with
4166          * zone_find_by_path().
4167          */
4168         ASSERT(MUTEX_HELD(&zonehash_lock));
4169         /*
4170          * The rootpath must end with a '/'
4171          */
4172         ASSERT(rootpath[rootpathlen - 1] == '/');
4173 
4174         /*
4175          * This intentionally does not count the rootpath itself if that
4176          * happens to be a mount point.
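         * For example, with a (hypothetical) rootpath of "/zones/a/root/", a
         * filesystem mounted at "/zones/a/root/var" is counted, while one
         * mounted at "/zones/a/root" itself is not.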
4177          */
4178         vfs_list_read_lock();
4179         vfsp = rootvfs;
4180         do {
4181                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4182                     rootpathlen) == 0)
4183                         count++;
4184                 vfsp = vfsp->vfs_next;
4185         } while (vfsp != rootvfs);
4186         vfs_list_unlock();
4187         return (count);
4188 }
4189 
4190 /*
4191  * Helper function to make sure that a zone created on 'rootpath'
4192  * wouldn't end up containing other zones' rootpaths.
4193  */
4194 static boolean_t
4195 zone_is_nested(const char *rootpath)
4196 {
4197         zone_t *zone;
4198         size_t rootpathlen = strlen(rootpath);
4199         size_t len;
4200 
4201         ASSERT(MUTEX_HELD(&zonehash_lock));
4202 
4203         /*
4204          * zone_set_root() appended '/' and '\0' at the end of rootpath
4205          */
4206         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4207             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4208                 return (B_TRUE);
4209 
4210         for (zone = list_head(&zone_active); zone != NULL;
4211             zone = list_next(&zone_active, zone)) {
4212                 if (zone == global_zone)
4213                         continue;
4214                 len = strlen(zone->zone_rootpath);
4215                 if (strncmp(rootpath, zone->zone_rootpath,
4216                     MIN(rootpathlen, len)) == 0)
4217                         return (B_TRUE);
4218         }
4219         return (B_FALSE);
4220 }
4221 
4222 static int
4223 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4224     size_t zone_privssz)
4225 {
4226         priv_set_t *privs;
4227 
4228         if (zone_privssz < sizeof (priv_set_t))
4229                 return (ENOMEM);
4230 
4231         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4232 
4233         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4234                 kmem_free(privs, sizeof (priv_set_t));
4235                 return (EFAULT);
4236         }
4237 
4238         zone->zone_privset = privs;
4239         return (0);
4240 }
4241 
4242 /*
4243  * We make creative use of nvlists to pass in rctls from userland.  The list is
4244  * a list of the following structures:
4245  *
4246  * (name = rctl_name, value = nvpair_list_array)
4247  *
4248  * Where each element of the nvpair_list_array is of the form:
4249  *
4250  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4251  *      (name = "limit", value = uint64_t),
4252  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4253  */
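/*
 * For example (a hypothetical encoding, not taken from any particular
 * caller), a single privileged "zone.max-lwps" rctl with a deny limit of
 * 1000 would be passed as:
 *
 * (name = "zone.max-lwps", value = [
 *	[(name = "privilege", value = RCPRIV_PRIVILEGED),
 *	 (name = "limit", value = 1000),
 *	 (name = "action", value = RCTL_LOCAL_DENY)]
 * ])
 */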
4254 static int
4255 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4256 {
4257         nvpair_t *nvp = NULL;
4258         nvlist_t *nvl = NULL;
4259         char *kbuf;
4260         int error;
4261         rctl_val_t rv;
4262 
4263         *nvlp = NULL;
4264 
4265         if (buflen == 0)
4266                 return (0);
4267 
4268         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4269                 return (ENOMEM);
4270         if (copyin(ubuf, kbuf, buflen)) {
4271                 error = EFAULT;
4272                 goto out;
4273         }
4274         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4275                 /*
4276                  * nvl may have been allocated and freed with the pointer left
4277                  * non-NULL, so reset it here.
4278                  */
4279                 nvl = NULL;
4280                 error = EINVAL;
4281                 goto out;
4282         }
4283         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4284                 rctl_dict_entry_t *rde;
4285                 rctl_hndl_t hndl;
4286                 nvlist_t **nvlarray;
4287                 uint_t i, nelem;
4288                 char *name;
4289 
4290                 error = EINVAL;
4291                 name = nvpair_name(nvp);
4292                 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4293                     != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4294                         goto out;
4295                 }
4296                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4297                         goto out;
4298                 }
4299                 rde = rctl_dict_lookup_hndl(hndl);
4300                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4301                 ASSERT(error == 0);
4302                 for (i = 0; i < nelem; i++) {
4303                         if (error = nvlist2rctlval(nvlarray[i], &rv))
4304                                 goto out;
4305                         if (rctl_invalid_value(rde, &rv)) {
4306                                 error = EINVAL;
4307                                 goto out;
4308                         }
4309                 }
4310         }
4311         error = 0;
4312         *nvlp = nvl;
4313 out:
4314         kmem_free(kbuf, buflen);
4315         if (error && nvl != NULL)
4316                 nvlist_free(nvl);
4317         return (error);
4318 }
4319 
4320 int
4321 zone_create_error(int er_error, int er_ext, int *er_out)
4322 {
4323         if (er_out != NULL) {
4324                 if (copyout(&er_ext, er_out, sizeof (int))) {
4325                         return (set_errno(EFAULT));
4326                 }
4327         }
4328         return (set_errno(er_error));
4329 }
4330 
4331 static int
4332 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4333 {
4334         ts_label_t *tsl;
4335         bslabel_t blab;
4336 
4337         /* Get label from user */
4338         if (copyin(lab, &blab, sizeof (blab)) != 0)
4339                 return (EFAULT);
4340         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4341         if (tsl == NULL)
4342                 return (ENOMEM);
4343 
4344         zone->zone_slabel = tsl;
4345         return (0);
4346 }
4347 
4348 /*
4349  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
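 *
 * For example, a buffer holding "tank/zones/a,tank/zones/b" (hypothetical
 * dataset names) yields two zone_dataset_t entries on zone_datasets.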
4350  */
4351 static int
4352 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4353 {
4354         char *kbuf;
4355         char *dataset, *next;
4356         zone_dataset_t *zd;
4357         size_t len;
4358 
4359         if (ubuf == NULL || buflen == 0)
4360                 return (0);
4361 
4362         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4363                 return (ENOMEM);
4364 
4365         if (copyin(ubuf, kbuf, buflen) != 0) {
4366                 kmem_free(kbuf, buflen);
4367                 return (EFAULT);
4368         }
4369 
4370         dataset = next = kbuf;
4371         for (;;) {
4372                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4373 
4374                 next = strchr(dataset, ',');
4375 
4376                 if (next == NULL)
4377                         len = strlen(dataset);
4378                 else
4379                         len = next - dataset;
4380 
4381                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4382                 bcopy(dataset, zd->zd_dataset, len);
4383                 zd->zd_dataset[len] = '\0';
4384 
4385                 list_insert_head(&zone->zone_datasets, zd);
4386 
4387                 if (next == NULL)
4388                         break;
4389 
4390                 dataset = next + 1;
4391         }
4392 
4393         kmem_free(kbuf, buflen);
4394         return (0);
4395 }
4396 
4397 /*
4398  * System call to create/initialize a new zone named 'zone_name', rooted
4399  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4400  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4401  * with labeling set by 'match', 'doi', and 'label'.
4402  *
4403  * If extended error is non-null, we may use it to return more detailed
4404  * error information.
4405  */
4406 static zoneid_t
4407 zone_create(const char *zone_name, const char *zone_root,
4408     const priv_set_t *zone_privs, size_t zone_privssz,
4409     caddr_t rctlbuf, size_t rctlbufsz,
4410     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4411     int match, uint32_t doi, const bslabel_t *label,
4412     int flags, zoneid_t zone_did)
4413 {
4414         struct zsched_arg zarg;
4415         nvlist_t *rctls = NULL;
4416         proc_t *pp = curproc;
4417         zone_t *zone, *ztmp;
4418         zoneid_t zoneid, start = GLOBAL_ZONEID;
4419         int error;
4420         int error2 = 0;
4421         char *str;
4422         cred_t *zkcr;
4423         boolean_t insert_label_hash;
4424 
4425         if (secpolicy_zone_config(CRED()) != 0)
4426                 return (set_errno(EPERM));
4427 
4428         /* can't boot zone from within chroot environment */
4429         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4430                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4431                     extended_error));
4432 
4433         /*
4434          * As the first step of zone creation, we want to allocate a zoneid.
4435          * This allocation is complicated by the fact that netstacks use the
4436          * zoneid to determine their stackid, but netstacks themselves are
4437          * freed asynchronously with respect to zone destruction.  This means
4438          * that a netstack reference leak (or in principle, an extraordinarily
4439          * long netstack reference hold) could result in a zoneid being
4440          * allocated that in fact corresponds to a stackid from an active
4441          * (referenced) netstack -- unleashing all sorts of havoc when that
4442          * netstack is actually (re)used.  (In the abstract, we might wish a
4443          * zoneid to not be deallocated until its last referencing netstack
4444          * has been released, but netstacks lack a backpointer into their
4445          * referencing zone -- and changing them to have such a pointer would
4446          * be substantial, to put it euphemistically.)  To avoid this, we
4447          * detect this condition on allocation: if we have allocated a zoneid
4448          * that corresponds to a netstack that's still in use, we warn about
4449          * it (as it is much more likely to be a reference leak than an actual
4450          * netstack reference), free it, and allocate another.  That these
4451  * identifiers are allocated out of an ID space assures that we won't
4452          * see the identifier we just allocated.
4453          */
4454         for (;;) {
4455                 zoneid = id_alloc(zoneid_space);
4456 
4457                 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4458                         break;
4459 
4460                 id_free(zoneid_space, zoneid);
4461 
4462                 if (start == GLOBAL_ZONEID) {
4463                         start = zoneid;
4464                 } else if (zoneid == start) {
4465                         /*
4466                          * We have managed to iterate over the entire available
4467                          * zoneid space -- there are no identifiers available,
4468                          * presumably due to some number of leaked netstack
4469                          * references.  While it's in principle possible for us
4470                          * to continue to try, it seems wiser to give up at
4471          * this point, warning and failing explicitly with a
4472                          * distinctive error.
4473                          */
4474                         cmn_err(CE_WARN, "zone_create() failed: all available "
4475                             "zone IDs have netstacks still in use");
4476                         return (set_errno(ENFILE));
4477                 }
4478 
4479                 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4480                     "netstack still in use", zoneid);
4481         }
4482 
4483         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4484 
4485         zone->zone_id = zoneid;
4486         zone->zone_did = zone_did;
4487         zone->zone_status = ZONE_IS_UNINITIALIZED;
4488         zone->zone_pool = pool_default;
4489         zone->zone_pool_mod = gethrtime();
4490         zone->zone_psetid = ZONE_PS_INVAL;
4491         zone->zone_ncpus = 0;
4492         zone->zone_ncpus_online = 0;
4493         zone->zone_restart_init = B_TRUE;
4494         zone->zone_brand = &native_brand;
4495         zone->zone_initname = NULL;
4496         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4497         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4498         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4499         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4500         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4501             offsetof(zone_ref_t, zref_linkage));
4502         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4503             offsetof(struct zsd_entry, zsd_linkage));
4504         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4505             offsetof(zone_dataset_t, zd_linkage));
4506         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4507             offsetof(zone_dl_t, zdl_linkage));
4508         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4509         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4510 
4511         if (flags & ZCF_NET_EXCL) {
4512                 zone->zone_flags |= ZF_NET_EXCL;
4513         }
4514 
4515         if ((error = zone_set_name(zone, zone_name)) != 0) {
4516                 zone_free(zone);
4517                 return (zone_create_error(error, 0, extended_error));
4518         }
4519 
4520         if ((error = zone_set_root(zone, zone_root)) != 0) {
4521                 zone_free(zone);
4522                 return (zone_create_error(error, 0, extended_error));
4523         }
4524         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4525                 zone_free(zone);
4526                 return (zone_create_error(error, 0, extended_error));
4527         }
4528 
4529         /* initialize node name to be the same as zone name */
4530         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4531         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4532         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4533 
4534         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4535         zone->zone_domain[0] = '\0';
4536         zone->zone_hostid = HW_INVALID_HOSTID;
4537         zone->zone_shares = 1;
4538         zone->zone_shmmax = 0;
4539         zone->zone_ipc.ipcq_shmmni = 0;
4540         zone->zone_ipc.ipcq_semmni = 0;
4541         zone->zone_ipc.ipcq_msgmni = 0;
4542         zone->zone_bootargs = NULL;
4543         zone->zone_fs_allowed = NULL;
4544 
4545         psecflags_default(&zone->zone_secflags);
4546 
4547         zone->zone_initname =
4548             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4549         (void) strcpy(zone->zone_initname, zone_default_initname);
4550         zone->zone_nlwps = 0;
4551         zone->zone_nlwps_ctl = INT_MAX;
4552         zone->zone_nprocs = 0;
4553         zone->zone_nprocs_ctl = INT_MAX;
4554         zone->zone_locked_mem = 0;
4555         zone->zone_locked_mem_ctl = UINT64_MAX;
4556         zone->zone_max_swap = 0;
4557         zone->zone_max_swap_ctl = UINT64_MAX;
4558         zone->zone_max_lofi = 0;
4559         zone->zone_max_lofi_ctl = UINT64_MAX;
4560         zone->zone_lockedmem_kstat = NULL;
4561         zone->zone_swapresv_kstat = NULL;
4562 
4563         zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
4564 
4565         /*
4566          * Zsched initializes the rctls.
4567          */
4568         zone->zone_rctls = NULL;
4569 
4570         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4571                 zone_free(zone);
4572                 return (zone_create_error(error, 0, extended_error));
4573         }
4574 
4575         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4576                 zone_free(zone);
4577                 return (set_errno(error));
4578         }
4579 
4580         /*
4581          * Read in the trusted system parameters:
4582          * match flag and sensitivity label.
4583          */
4584         zone->zone_match = match;
4585         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4586                 /* Fail if requested to set doi to anything but system's doi */
4587                 if (doi != 0 && doi != default_doi) {
4588                         zone_free(zone);
4589                         return (set_errno(EINVAL));
4590                 }
4591                 /* Always apply system's doi to the zone */
4592                 error = zone_set_label(zone, label, default_doi);
4593                 if (error != 0) {
4594                         zone_free(zone);
4595                         return (set_errno(error));
4596                 }
4597                 insert_label_hash = B_TRUE;
4598         } else {
4599                 /* all zones get an admin_low label if system is not labeled */
4600                 zone->zone_slabel = l_admin_low;
4601                 label_hold(l_admin_low);
4602                 insert_label_hash = B_FALSE;
4603         }
4604 
4605         /*
4606          * Stop all lwps since that's what normally happens as part of fork().
4607          * This needs to happen before we grab any locks to avoid deadlock
4608          * (another lwp in the process could be waiting for the held lock).
4609          */
4610         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4611                 zone_free(zone);
4612                 nvlist_free(rctls);
4613                 return (zone_create_error(EINTR, 0, extended_error));
4614         }
4615 
4616         if (block_mounts(zone) == 0) {
4617                 mutex_enter(&pp->p_lock);
4618                 if (curthread != pp->p_agenttp)
4619                         continuelwps(pp);
4620                 mutex_exit(&pp->p_lock);
4621                 zone_free(zone);
4622                 nvlist_free(rctls);
4623                 return (zone_create_error(EINTR, 0, extended_error));
4624         }
4625 
4626         /*
4627          * Set up credential for kernel access.  After this, any errors
4628          * should go through the dance in errout rather than calling
4629          * zone_free directly.
4630          */
4631         zone->zone_kcred = crdup(kcred);
4632         crsetzone(zone->zone_kcred, zone);
4633         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4634         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4635         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4636         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4637 
4638         mutex_enter(&zonehash_lock);
4639         /*
4640          * Make sure zone doesn't already exist.
4641          *
4642          * If the system and zone are labeled,
4643          * make sure no other zone exists that has the same label.
4644          */
4645         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4646             (insert_label_hash &&
4647             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4648                 zone_status_t status;
4649 
4650                 status = zone_status_get(ztmp);
4651                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4652                         error = EEXIST;
4653                 else
4654                         error = EBUSY;
4655 
4656                 if (insert_label_hash)
4657                         error2 = ZE_LABELINUSE;
4658 
4659                 goto errout;
4660         }
4661 
4662         /*
4663          * Don't allow zone creations which would cause one zone's rootpath to
4664          * be accessible from that of another (non-global) zone.
4665          */
4666         if (zone_is_nested(zone->zone_rootpath)) {
4667                 error = EBUSY;
4668                 goto errout;
4669         }
4670 
4671         ASSERT(zonecount != 0);         /* check for leaks */
4672         if (zonecount + 1 > maxzones) {
4673                 error = ENOMEM;
4674                 goto errout;
4675         }
4676 
4677         if (zone_mount_count(zone->zone_rootpath) != 0) {
4678                 error = EBUSY;
4679                 error2 = ZE_AREMOUNTS;
4680                 goto errout;
4681         }
4682 
4683         /*
4684          * Zone is still incomplete, but we need to drop all locks while
4685          * zsched() initializes this zone's kernel process.  We
4686          * optimistically add the zone to the hashtable and associated
4687          * lists so a parallel zone_create() doesn't try to create the
4688          * same zone.
4689          */
4690         zonecount++;
4691         (void) mod_hash_insert(zonehashbyid,
4692             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4693             (mod_hash_val_t)(uintptr_t)zone);
4694         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4695         (void) strcpy(str, zone->zone_name);
4696         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4697             (mod_hash_val_t)(uintptr_t)zone);
4698         if (insert_label_hash) {
4699                 (void) mod_hash_insert(zonehashbylabel,
4700                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4701                 zone->zone_flags |= ZF_HASHED_LABEL;
4702         }
4703 
4704         /*
4705          * Insert into active list.  At this point there are no 'hold's
4706          * on the zone, but everyone else knows not to use it, so we can
4707          * continue to use it.  zsched() will do a zone_hold() if the
4708          * newproc() is successful.
4709          */
4710         list_insert_tail(&zone_active, zone);
4711         mutex_exit(&zonehash_lock);
4712 
4713         zarg.zone = zone;
4714         zarg.nvlist = rctls;
4715         /*
4716          * The process, task, and project rctls are probably wrong;
4717          * we need an interface to get the default values of all rctls,
4718          * and initialize zsched appropriately.  I'm not sure that that
4719          * makes much of a difference, though.
4720          */
4721         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4722         if (error != 0) {
4723                 /*
4724                  * We need to undo all globally visible state.
4725                  */
4726                 mutex_enter(&zonehash_lock);
4727                 list_remove(&zone_active, zone);
4728                 if (zone->zone_flags & ZF_HASHED_LABEL) {
4729                         ASSERT(zone->zone_slabel != NULL);
4730                         (void) mod_hash_destroy(zonehashbylabel,
4731                             (mod_hash_key_t)zone->zone_slabel);
4732                 }
4733                 (void) mod_hash_destroy(zonehashbyname,
4734                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4735                 (void) mod_hash_destroy(zonehashbyid,
4736                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4737                 ASSERT(zonecount > 1);
4738                 zonecount--;
4739                 goto errout;
4740         }
4741 
4742         /*
4743          * Zone creation can't fail from now on.
4744          */
4745 
4746         /*
4747          * Create zone kstats
4748          */
4749         zone_kstat_create(zone);
4750 
4751         /*
4752          * Let the other lwps continue.
4753          */
4754         mutex_enter(&pp->p_lock);
4755         if (curthread != pp->p_agenttp)
4756                 continuelwps(pp);
4757         mutex_exit(&pp->p_lock);
4758 
4759         /*
4760          * Wait for zsched to finish initializing the zone.
4761          */
4762         zone_status_wait(zone, ZONE_IS_READY);
4763         /*
4764          * The zone is fully visible, so we can let mounts progress.
4765          */
4766         resume_mounts(zone);
4767         nvlist_free(rctls);
4768 
4769         return (zoneid);
4770 
4771 errout:
4772         mutex_exit(&zonehash_lock);
4773         /*
4774          * Let the other lwps continue.
4775          */
4776         mutex_enter(&pp->p_lock);
4777         if (curthread != pp->p_agenttp)
4778                 continuelwps(pp);
4779         mutex_exit(&pp->p_lock);
4780 
4781         resume_mounts(zone);
4782         nvlist_free(rctls);
4783         /*
4784          * There is currently one reference to the zone, a cred_ref from
4785          * zone_kcred.  To free the zone, we call crfree, which will call
4786          * zone_cred_rele, which will call zone_free.
4787          */
4788         ASSERT(zone->zone_cred_ref == 1);
4789         ASSERT(zone->zone_kcred->cr_ref == 1);
4790         ASSERT(zone->zone_ref == 0);
4791         zkcr = zone->zone_kcred;
4792         zone->zone_kcred = NULL;
4793         crfree(zkcr);                           /* triggers call to zone_free */
4794         return (zone_create_error(error, error2, extended_error));
4795 }
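
/*
 * Illustrative sketch (not part of this file): zone_create() above and
 * zone_boot() below are reached from userland through the zone(2)
 * multiplexing system call, normally driven by zoneadm/zoneadmd via
 * private libc wrappers.  A minimal, hypothetical caller could look
 * like this (remaining zone_def fields and error handling elided):
 *
 *        zone_def zd = { 0 };
 *        zd.zone_name = "testzone";
 *        zd.zone_root = "/zones/testzone/root";
 *        ...
 *        zoneid_t zid = zone_create(&zd);    (zone is now ZONE_IS_READY)
 *        if (zid != -1)
 *                (void) zone_boot(zid);      (READY -> BOOTING -> RUNNING)
 */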
4796 
4797 /*
4798  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4799  * the heavy lifting.  The program launched at the "top" of the zone is
4800  * named by the zone's zone_initname, which defaults to
4801  * zone_default_initname and may be overridden via ZONE_ATTR_INITNAME.
4802  */
4803 static int
4804 zone_boot(zoneid_t zoneid)
4805 {
4806         int err;
4807         zone_t *zone;
4808 
4809         if (secpolicy_zone_config(CRED()) != 0)
4810                 return (set_errno(EPERM));
4811         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4812                 return (set_errno(EINVAL));
4813 
4814         mutex_enter(&zonehash_lock);
4815         /*
4816          * Look for zone under hash lock to prevent races with calls to
4817          * zone_shutdown, zone_destroy, etc.
4818          */
4819         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4820                 mutex_exit(&zonehash_lock);
4821                 return (set_errno(EINVAL));
4822         }
4823 
4824         mutex_enter(&zone_status_lock);
4825         if (zone_status_get(zone) != ZONE_IS_READY) {
4826                 mutex_exit(&zone_status_lock);
4827                 mutex_exit(&zonehash_lock);
4828                 return (set_errno(EINVAL));
4829         }
4830         zone_status_set(zone, ZONE_IS_BOOTING);
4831         mutex_exit(&zone_status_lock);
4832 
4833         zone_hold(zone);        /* so we can use the zone_t later */
4834         mutex_exit(&zonehash_lock);
4835 
4836         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4837                 zone_rele(zone);
4838                 return (set_errno(EINTR));
4839         }
4840 
4841         /*
4842          * Boot (starting init) might have failed, in which case the zone
4843          * will go to the SHUTTING_DOWN state; an appropriate errno will
4844          * be placed in zone->zone_boot_err, and so we return that.
4845          */
4846         err = zone->zone_boot_err;
4847         zone_rele(zone);
4848         return (err ? set_errno(err) : 0);
4849 }
4850 
4851 /*
4852  * Kills all user processes in the zone, waiting for them all to exit
4853  * before returning.
4854  */
4855 static int
4856 zone_empty(zone_t *zone)
4857 {
4858         int waitstatus;
4859 
4860         /*
4861          * We need to drop zonehash_lock before killing all
4862          * processes, otherwise we'll deadlock with zone_find_*
4863          * which can be called from the exit path.
4864          */
4865         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4866         while ((waitstatus = zone_status_timedwait_sig(zone,
4867             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4868                 killall(zone->zone_id);
4869         }
4870         /*
4871          * return EINTR if we were signaled
4872          */
4873         if (waitstatus == 0)
4874                 return (EINTR);
4875         return (0);
4876 }
4877 
4878 /*
4879  * This function implements the policy for zone visibility.
4880  *
4881  * In standard Solaris, a non-global zone can only see itself.
4882  *
4883  * In Trusted Extensions, a labeled zone can look up any zone whose label
4884  * it dominates. For this test, the label of the global zone is treated as
4885  * admin_high so it is special-cased instead of being checked for dominance.
4886  *
4887  * Returns true if zone attributes are viewable, false otherwise.
4888  */
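/*
 * For example (hypothetical labels): a process in a zone labeled
 * "CONF HIGH" may look up a zone labeled "CONF LOW" provided that
 * "CONF HIGH" dominates "CONF LOW", but not vice versa.  The global
 * zone is never matched by the dominance test below; only the explicit
 * checks at the top of the function can make it visible.
 */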
4889 static boolean_t
4890 zone_list_access(zone_t *zone)
4891 {
4892 
4893         if (curproc->p_zone == global_zone ||
4894             curproc->p_zone == zone) {
4895                 return (B_TRUE);
4896         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4897                 bslabel_t *curproc_label;
4898                 bslabel_t *zone_label;
4899 
4900                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4901                 zone_label = label2bslabel(zone->zone_slabel);
4902 
4903                 if (zone->zone_id != GLOBAL_ZONEID &&
4904                     bldominates(curproc_label, zone_label)) {
4905                         return (B_TRUE);
4906                 } else {
4907                         return (B_FALSE);
4908                 }
4909         } else {
4910                 return (B_FALSE);
4911         }
4912 }
4913 
4914 /*
4915  * System call to start the zone's halt sequence.  By the time this
4916  * function successfully returns, all user processes and kernel threads
4917  * executing in it will have exited, ZSD shutdown callbacks executed,
4918  * and the zone status set to ZONE_IS_DOWN.
4919  *
4920  * It is possible that the call will interrupt itself if the caller is the
4921  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4922  */
4923 static int
4924 zone_shutdown(zoneid_t zoneid)
4925 {
4926         int error;
4927         zone_t *zone;
4928         zone_status_t status;
4929 
4930         if (secpolicy_zone_config(CRED()) != 0)
4931                 return (set_errno(EPERM));
4932         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4933                 return (set_errno(EINVAL));
4934 
4935         mutex_enter(&zonehash_lock);
4936         /*
4937          * Look for zone under hash lock to prevent races with other
4938          * calls to zone_shutdown and zone_destroy.
4939          */
4940         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4941                 mutex_exit(&zonehash_lock);
4942                 return (set_errno(EINVAL));
4943         }
4944 
4945         /*
4946          * We have to drop zonehash_lock before calling block_mounts.
4947          * Hold the zone so we can continue to use the zone_t.
4948          */
4949         zone_hold(zone);
4950         mutex_exit(&zonehash_lock);
4951 
4952         /*
4953          * Block mounts so that VFS_MOUNT() can get an accurate view of
4954          * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
4955          *
4956          * e.g. NFS can fail the mount if it determines that the zone
4957          * has already begun the shutdown sequence.
4958          */
4960         if (block_mounts(zone) == 0) {
4961                 zone_rele(zone);
4962                 return (set_errno(EINTR));
4963         }
4964 
4965         mutex_enter(&zonehash_lock);
4966         mutex_enter(&zone_status_lock);
4967         status = zone_status_get(zone);
4968         /*
4969          * Fail if the zone isn't fully initialized yet.
4970          */
4971         if (status < ZONE_IS_READY) {
4972                 mutex_exit(&zone_status_lock);
4973                 mutex_exit(&zonehash_lock);
4974                 resume_mounts(zone);
4975                 zone_rele(zone);
4976                 return (set_errno(EINVAL));
4977         }
4978         /*
4979          * If conditions required for zone_shutdown() to return have been met,
4980          * return success.
4981          */
4982         if (status >= ZONE_IS_DOWN) {
4983                 mutex_exit(&zone_status_lock);
4984                 mutex_exit(&zonehash_lock);
4985                 resume_mounts(zone);
4986                 zone_rele(zone);
4987                 return (0);
4988         }
4989         /*
4990          * If zone_shutdown() hasn't been called before, go through the motions.
4991          * If it has, there's nothing to do but wait for the kernel threads to
4992          * drain.
4993          */
4994         if (status < ZONE_IS_EMPTY) {
4995                 uint_t ntasks;
4996 
4997                 mutex_enter(&zone->zone_lock);
4998                 if ((ntasks = zone->zone_ntasks) != 1) {
4999                         /*
5000                          * There's still stuff running.
5001                          */
5002                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5003                 }
5004                 mutex_exit(&zone->zone_lock);
5005                 if (ntasks == 1) {
5006                         /*
5007                          * The only way to create another task is through
5008                          * zone_enter(), which will block until we drop
5009                          * zonehash_lock.  The zone is empty.
5010                          */
5011                         if (zone->zone_kthreads == NULL) {
5012                                 /*
5013                                  * Skip ahead to ZONE_IS_DOWN
5014                                  */
5015                                 zone_status_set(zone, ZONE_IS_DOWN);
5016                         } else {
5017                                 zone_status_set(zone, ZONE_IS_EMPTY);
5018                         }
5019                 }
5020         }
5021         mutex_exit(&zone_status_lock);
5022         mutex_exit(&zonehash_lock);
5023         resume_mounts(zone);
5024 
5025         if ((error = zone_empty(zone)) != 0) {
5026                 zone_rele(zone);
5027                 return (set_errno(error));
5028         }
5029         /*
5030          * After the zone status goes to ZONE_IS_DOWN this zone will no
5031          * longer be notified of changes to the pools configuration, so
5032          * in order to not end up with a stale pool pointer, we point
5033          * ourselves at the default pool and remove all resource
5034          * visibility.  This is especially important as the zone_t may
5035          * languish on the deathrow for a very long time waiting for
5036          * creds to drain out.
5037          *
5038          * This rebinding of the zone can happen multiple times
5039          * (presumably due to interrupted or parallel system calls)
5040          * without any adverse effects.
5041          */
5042         if (pool_lock_intr() != 0) {
5043                 zone_rele(zone);
5044                 return (set_errno(EINTR));
5045         }
5046         if (pool_state == POOL_ENABLED) {
5047                 mutex_enter(&cpu_lock);
5048                 zone_pool_set(zone, pool_default);
5049                 /*
5050                  * The zone no longer needs to be able to see any cpus.
5051                  */
5052                 zone_pset_set(zone, ZONE_PS_INVAL);
5053                 mutex_exit(&cpu_lock);
5054         }
5055         pool_unlock();
5056 
5057         /*
5058          * ZSD shutdown callbacks can be executed multiple times, hence
5059          * it is safe to not be holding any locks across this call.
5060          */
5061         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5062 
5063         mutex_enter(&zone_status_lock);
5064         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5065                 zone_status_set(zone, ZONE_IS_DOWN);
5066         mutex_exit(&zone_status_lock);
5067 
5068         /*
5069          * Wait for kernel threads to drain.
5070          */
5071         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5072                 zone_rele(zone);
5073                 return (set_errno(EINTR));
5074         }
5075 
5076         /*
5077          * The zone can become down/destroyable even if the above wait
5078          * returns EINTR, so any code added here may never execute.
5079          * (i.e. don't add code here)
5080          */
5081 
5082         zone_rele(zone);
5083         return (0);
5084 }
5085 
5086 /*
5087  * Log the specified zone's reference counts.  The caller should not be
5088  * holding the zone's zone_lock.
5089  */
5090 static void
5091 zone_log_refcounts(zone_t *zone)
5092 {
5093         char *buffer;
5094         char *buffer_position;
5095         uint32_t buffer_size;
5096         uint32_t index;
5097         uint_t ref;
5098         uint_t cred_ref;
5099 
5100         /*
5101          * Construct a string representing the subsystem-specific reference
5102          * counts.  The counts are printed in ascending order by index into the
5103          * zone_t::zone_subsys_ref array.  The list will be surrounded by
5104          * square brackets [] and will only contain nonzero reference counts.
5105          *
5106          * The buffer will hold two square bracket characters plus ten digits,
5107          * one colon, one space, one comma, and some characters for a
5108          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5109          * bit integers have at most ten decimal digits.)  The last
5110          * reference count's comma is replaced by the closing square
5111          * bracket and a NULL character to terminate the string.
5112          *
5113          * NOTE: We have to grab the zone's zone_lock to create a consistent
5114          * snapshot of the zone's reference counters.
5115          *
5116          * First, figure out how much space the string buffer will need.
5117          * The buffer's size is stored in buffer_size.
5118          */
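        /*
         * Worked example (names and counts for illustration only): if
         * just "NFS" (3 references) and "LOFI" (1 reference) are
         * nonzero, the string becomes "[NFS: 3, LOFI: 1]".  The sizing
         * below reserves strlen("NFS") + 13 and strlen("LOFI") + 13
         * bytes (": " plus up to ten digits plus ","), plus two bytes
         * for the brackets; the trailing ',' is later overwritten with
         * ']' and the NUL left behind by snprintf() terminates the
         * string.
         */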
5119         buffer_size = 2;                        /* for the square brackets */
5120         mutex_enter(&zone->zone_lock);
5121         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5122         ref = zone->zone_ref;
5123         cred_ref = zone->zone_cred_ref;
5124         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5125                 if (zone->zone_subsys_ref[index] != 0)
5126                         buffer_size += strlen(zone_ref_subsys_names[index]) +
5127                             13;
5128         if (buffer_size == 2) {
5129                 /*
5130                  * No subsystems had nonzero reference counts.  Don't bother
5131                  * with allocating a buffer; just log the general-purpose and
5132                  * credential reference counts.
5133                  */
5134                 mutex_exit(&zone->zone_lock);
5135                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5136                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
5137                     "references and %u credential references are still extant",
5138                     zone->zone_name, zone->zone_id, ref, cred_ref);
5139                 return;
5140         }
5141 
5142         /*
5143          * buffer_size contains the exact number of characters that the
5144          * buffer will need.  Allocate the buffer and fill it with nonzero
5145          * subsystem-specific reference counts.  Surround the results with
5146          * square brackets afterwards.
5147          */
5148         buffer = kmem_alloc(buffer_size, KM_SLEEP);
5149         buffer_position = &buffer[1];
5150         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5151                 /*
5152                  * NOTE: The DDI's version of sprintf() returns a pointer to
5153                  * the modified buffer rather than the number of bytes written
5154                  * (as in snprintf(3C)).  This is unfortunate and annoying.
5155                  * Therefore, we'll use snprintf() with INT_MAX to get the
5156                  * number of bytes written.  Using INT_MAX is safe because
5157                  * the buffer is perfectly sized for the data: we'll never
5158                  * overrun the buffer.
5159                  */
5160                 if (zone->zone_subsys_ref[index] != 0)
5161                         buffer_position += snprintf(buffer_position, INT_MAX,
5162                             "%s: %u,", zone_ref_subsys_names[index],
5163                             zone->zone_subsys_ref[index]);
5164         }
5165         mutex_exit(&zone->zone_lock);
5166         buffer[0] = '[';
5167         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5168         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5169         buffer_position[-1] = ']';
5170 
5171         /*
5172          * Log the reference counts and free the message buffer.
5173          */
5174         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5175             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5176             "%u credential references are still extant %s", zone->zone_name,
5177             zone->zone_id, ref, cred_ref, buffer);
5178         kmem_free(buffer, buffer_size);
5179 }
5180 
5181 /*
5182  * System call entry point to finalize the zone halt process.  The caller
5183  * must have already successfully called zone_shutdown().
5184  *
5185  * Upon successful completion, the zone will have been fully destroyed:
5186  * zsched will have exited, destructor callbacks executed, and the zone
5187  * removed from the list of active zones.
5188  */
5189 static int
5190 zone_destroy(zoneid_t zoneid)
5191 {
5192         uint64_t uniqid;
5193         zone_t *zone;
5194         zone_status_t status;
5195         clock_t wait_time;
5196         boolean_t log_refcounts;
5197 
5198         if (secpolicy_zone_config(CRED()) != 0)
5199                 return (set_errno(EPERM));
5200         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5201                 return (set_errno(EINVAL));
5202 
5203         mutex_enter(&zonehash_lock);
5204         /*
5205          * Look for zone under hash lock to prevent races with other
5206          * calls to zone_destroy.
5207          */
5208         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5209                 mutex_exit(&zonehash_lock);
5210                 return (set_errno(EINVAL));
5211         }
5212 
5213         if (zone_mount_count(zone->zone_rootpath) != 0) {
5214                 mutex_exit(&zonehash_lock);
5215                 return (set_errno(EBUSY));
5216         }
5217         mutex_enter(&zone_status_lock);
5218         status = zone_status_get(zone);
5219         if (status < ZONE_IS_DOWN) {
5220                 mutex_exit(&zone_status_lock);
5221                 mutex_exit(&zonehash_lock);
5222                 return (set_errno(EBUSY));
5223         } else if (status == ZONE_IS_DOWN) {
5224                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5225         }
5226         mutex_exit(&zone_status_lock);
5227         zone_hold(zone);
5228         mutex_exit(&zonehash_lock);
5229 
5230         /*
5231          * wait for zsched to exit
5232          */
5233         zone_status_wait(zone, ZONE_IS_DEAD);
5234         zone_zsd_callbacks(zone, ZSD_DESTROY);
5235         zone->zone_netstack = NULL;
5236         uniqid = zone->zone_uniqid;
5237         zone_rele(zone);
5238         zone = NULL;    /* potentially free'd */
5239 
5240         log_refcounts = B_FALSE;
5241         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5242         mutex_enter(&zonehash_lock);
5243         for (; /* ever */; ) {
5244                 boolean_t unref;
5245                 boolean_t refs_have_been_logged;
5246 
5247                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5248                     zone->zone_uniqid != uniqid) {
5249                         /*
5250                          * The zone has gone away.  Necessary conditions
5251                          * are met, so we return success.
5252                          */
5253                         mutex_exit(&zonehash_lock);
5254                         return (0);
5255                 }
5256                 mutex_enter(&zone->zone_lock);
5257                 unref = ZONE_IS_UNREF(zone);
5258                 refs_have_been_logged = (zone->zone_flags &
5259                     ZF_REFCOUNTS_LOGGED);
5260                 mutex_exit(&zone->zone_lock);
5261                 if (unref) {
5262                         /*
5263                          * There is only one reference to the zone -- that
5264                          * added when the zone was added to the hashtables --
5265                          * and things will remain this way until we drop
5266                          * zonehash_lock... we can go ahead and clean up the
5267                          * zone.
5268                          */
5269                         break;
5270                 }
5271 
5272                 /*
5273                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5274                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5275                  * some zone's general-purpose reference count reaches one.
5276                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5277                  * on zone_destroy_cv, then log the zone's reference counts and
5278                  * continue to wait for zone_rele() and zone_cred_rele().
5279                  */
5280                 if (!refs_have_been_logged) {
5281                         if (!log_refcounts) {
5282                                 /*
5283                                  * This thread hasn't timed out waiting on
5284                                  * zone_destroy_cv yet.  Wait wait_time clock
5285                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5286                                  * seconds) for the zone's references to clear.
5287                                  */
5288                                 ASSERT(wait_time > 0);
5289                                 wait_time = cv_reltimedwait_sig(
5290                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5291                                     TR_SEC);
5292                                 if (wait_time > 0) {
5293                                         /*
5294                                          * A thread in zone_rele() or
5295                                          * zone_cred_rele() signaled
5296                                          * zone_destroy_cv before this thread's
5297                                          * wait timed out.  The zone might have
5298                                          * only one reference left; find out!
5299                                          */
5300                                         continue;
5301                                 } else if (wait_time == 0) {
5302                                         /* The thread's process was signaled. */
5303                                         mutex_exit(&zonehash_lock);
5304                                         return (set_errno(EINTR));
5305                                 }
5306 
5307                                 /*
5308                                  * The thread timed out while waiting on
5309                                  * zone_destroy_cv.  Even though the thread
5310                                  * timed out, it has to check whether another
5311                                  * thread woke up from zone_destroy_cv and
5312                                  * destroyed the zone.
5313                                  *
5314                                  * If the zone still exists and has more than
5315                                  * one unreleased general-purpose reference,
5316                                  * then log the zone's reference counts.
5317                                  */
5318                                 log_refcounts = B_TRUE;
5319                                 continue;
5320                         }
5321 
5322                         /*
5323                          * The thread already timed out on zone_destroy_cv while
5324                          * waiting for subsystems to release the zone's last
5325                          * general-purpose references.  Log the zone's reference
5326                          * counts and wait indefinitely on zone_destroy_cv.
5327                          */
5328                         zone_log_refcounts(zone);
5329                 }
5330                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5331                         /* The thread's process was signaled. */
5332                         mutex_exit(&zonehash_lock);
5333                         return (set_errno(EINTR));
5334                 }
5335         }
5336 
5337         /*
5338          * Remove CPU cap for this zone now since we're not going to
5339          * fail below this point.
5340          */
5341         cpucaps_zone_remove(zone);
5342 
5343         /* Get rid of the zone's kstats */
5344         zone_kstat_delete(zone);
5345 
5346         /* remove the pfexecd doors */
5347         if (zone->zone_pfexecd != NULL) {
5348                 klpd_freelist(&zone->zone_pfexecd);
5349                 zone->zone_pfexecd = NULL;
5350         }
5351 
5352         /* free brand specific data */
5353         if (ZONE_IS_BRANDED(zone))
5354                 ZBROP(zone)->b_free_brand_data(zone);
5355 
5356         /* Say goodbye to brand framework. */
5357         brand_unregister_zone(zone->zone_brand);
5358 
5359         /*
5360          * It is now safe to let the zone be recreated; remove it from the
5361          * lists.  The memory will not be freed until the last cred
5362          * reference goes away.
5363          */
5364         ASSERT(zonecount > 1);       /* must be > 1; can't destroy global zone */
5365         zonecount--;
5366         /* remove from active list and hash tables */
5367         list_remove(&zone_active, zone);
5368         (void) mod_hash_destroy(zonehashbyname,
5369             (mod_hash_key_t)zone->zone_name);
5370         (void) mod_hash_destroy(zonehashbyid,
5371             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5372         if (zone->zone_flags & ZF_HASHED_LABEL)
5373                 (void) mod_hash_destroy(zonehashbylabel,
5374                     (mod_hash_key_t)zone->zone_slabel);
5375         mutex_exit(&zonehash_lock);
5376 
5377         /*
5378          * Release the root vnode; we're not using it anymore, and no other
5379          * thread that might access it should exist at this point.
5380          */
5381         if (zone->zone_rootvp != NULL) {
5382                 VN_RELE(zone->zone_rootvp);
5383                 zone->zone_rootvp = NULL;
5384         }
5385 
5386         /* add to deathrow list */
5387         mutex_enter(&zone_deathrow_lock);
5388         list_insert_tail(&zone_deathrow, zone);
5389         mutex_exit(&zone_deathrow_lock);
5390 
5391         /*
5392          * Drop last reference (which was added by zsched()), this will
5393          * free the zone unless there are outstanding cred references.
5394          */
5395         zone_rele(zone);
5396         return (0);
5397 }
5398 
5399 /*
5400  * System call entry point for zone_getattr(2).
5401  */
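/*
 * zone_getattr() returns the attribute's full size while copying out at
 * most bufsize bytes, so userland can probe for the required buffer
 * size first.  A hedged sketch of the usual two-call pattern
 * (hypothetical variables, error handling elided):
 *
 *        ssize_t sz = zone_getattr(zid, ZONE_ATTR_NAME, NULL, 0);
 *        char *name = malloc(sz);
 *        (void) zone_getattr(zid, ZONE_ATTR_NAME, name, sz);
 */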
5402 static ssize_t
5403 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5404 {
5405         size_t size;
5406         int error = 0, err;
5407         zone_t *zone;
5408         char *zonepath;
5409         char *outstr;
5410         zone_status_t zone_status;
5411         pid_t initpid;
5412         boolean_t global = (curzone == global_zone);
5413         boolean_t inzone = (curzone->zone_id == zoneid);
5414         ushort_t flags;
5415         zone_net_data_t *zbuf;
5416 
5417         mutex_enter(&zonehash_lock);
5418         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5419                 mutex_exit(&zonehash_lock);
5420                 return (set_errno(EINVAL));
5421         }
5422         zone_status = zone_status_get(zone);
5423         if (zone_status < ZONE_IS_INITIALIZED) {
5424                 mutex_exit(&zonehash_lock);
5425                 return (set_errno(EINVAL));
5426         }
5427         zone_hold(zone);
5428         mutex_exit(&zonehash_lock);
5429 
5430         /*
5431          * If not in the global zone, don't show information about other zones,
5432          * unless the system is labeled and the local zone's label dominates
5433          * the other zone.
5434          */
5435         if (!zone_list_access(zone)) {
5436                 zone_rele(zone);
5437                 return (set_errno(EINVAL));
5438         }
5439 
5440         switch (attr) {
5441         case ZONE_ATTR_ROOT:
5442                 if (global) {
5443                         /*
5444                          * Copy the path to trim the trailing "/" (except for
5445                          * the global zone).
5446                          */
5447                         if (zone != global_zone)
5448                                 size = zone->zone_rootpathlen - 1;
5449                         else
5450                                 size = zone->zone_rootpathlen;
5451                         zonepath = kmem_alloc(size, KM_SLEEP);
5452                         bcopy(zone->zone_rootpath, zonepath, size);
5453                         zonepath[size - 1] = '\0';
5454                 } else {
5455                         if (inzone || !is_system_labeled()) {
5456                                 /*
5457                                  * Caller is not in the global zone.
5458                                  * If the query is on the current zone
5459                                  * or the system is not labeled,
5460                                  * just return a faked-up path for this zone.
5461                                  */
5462                                 zonepath = "/";
5463                                 size = 2;
5464                         } else {
5465                                 /*
5466                                  * Return the zone_prefix + zone name path.
5467                                  */
5468                                 int prefix_len = strlen(zone_prefix);
5469                                 int zname_len = strlen(zone->zone_name);
5470 
5471                                 size = prefix_len + zname_len + 1;
5472                                 zonepath = kmem_alloc(size, KM_SLEEP);
5473                                 bcopy(zone_prefix, zonepath, prefix_len);
5474                                 bcopy(zone->zone_name, zonepath +
5475                                     prefix_len, zname_len);
5476                                 zonepath[size - 1] = '\0';
5477                         }
5478                 }
5479                 if (bufsize > size)
5480                         bufsize = size;
5481                 if (buf != NULL) {
5482                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5483                         if (err != 0 && err != ENAMETOOLONG)
5484                                 error = EFAULT;
5485                 }
5486                 if (global || (is_system_labeled() && !inzone))
5487                         kmem_free(zonepath, size);
5488                 break;
5489 
5490         case ZONE_ATTR_NAME:
5491                 size = strlen(zone->zone_name) + 1;
5492                 if (bufsize > size)
5493                         bufsize = size;
5494                 if (buf != NULL) {
5495                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5496                         if (err != 0 && err != ENAMETOOLONG)
5497                                 error = EFAULT;
5498                 }
5499                 break;
5500 
5501         case ZONE_ATTR_STATUS:
5502                 /*
5503                  * Since we're not holding zonehash_lock, the zone status
5504                  * may be anything; leave it up to userland to sort it out.
5505                  */
5506                 size = sizeof (zone_status);
5507                 if (bufsize > size)
5508                         bufsize = size;
5509                 zone_status = zone_status_get(zone);
5510                 if (buf != NULL &&
5511                     copyout(&zone_status, buf, bufsize) != 0)
5512                         error = EFAULT;
5513                 break;
5514         case ZONE_ATTR_FLAGS:
5515                 size = sizeof (zone->zone_flags);
5516                 if (bufsize > size)
5517                         bufsize = size;
5518                 flags = zone->zone_flags;
5519                 if (buf != NULL &&
5520                     copyout(&flags, buf, bufsize) != 0)
5521                         error = EFAULT;
5522                 break;
5523         case ZONE_ATTR_PRIVSET:
5524                 size = sizeof (priv_set_t);
5525                 if (bufsize > size)
5526                         bufsize = size;
5527                 if (buf != NULL &&
5528                     copyout(zone->zone_privset, buf, bufsize) != 0)
5529                         error = EFAULT;
5530                 break;
5531         case ZONE_ATTR_UNIQID:
5532                 size = sizeof (zone->zone_uniqid);
5533                 if (bufsize > size)
5534                         bufsize = size;
5535                 if (buf != NULL &&
5536                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5537                         error = EFAULT;
5538                 break;
5539         case ZONE_ATTR_POOLID:
5540                 {
5541                         pool_t *pool;
5542                         poolid_t poolid;
5543 
5544                         if (pool_lock_intr() != 0) {
5545                                 error = EINTR;
5546                                 break;
5547                         }
5548                         pool = zone_pool_get(zone);
5549                         poolid = pool->pool_id;
5550                         pool_unlock();
5551                         size = sizeof (poolid);
5552                         if (bufsize > size)
5553                                 bufsize = size;
5554                         if (buf != NULL && copyout(&poolid, buf, bufsize) != 0)
5555                                 error = EFAULT;
5556                 }
5557                 break;
5558         case ZONE_ATTR_SLBL:
5559                 size = sizeof (bslabel_t);
5560                 if (bufsize > size)
5561                         bufsize = size;
5562                 if (zone->zone_slabel == NULL)
5563                         error = EINVAL;
5564                 else if (buf != NULL &&
5565                     copyout(label2bslabel(zone->zone_slabel), buf,
5566                     bufsize) != 0)
5567                         error = EFAULT;
5568                 break;
5569         case ZONE_ATTR_INITPID:
5570                 size = sizeof (initpid);
5571                 if (bufsize > size)
5572                         bufsize = size;
5573                 initpid = zone->zone_proc_initpid;
5574                 if (initpid == -1) {
5575                         error = ESRCH;
5576                         break;
5577                 }
5578                 if (buf != NULL &&
5579                     copyout(&initpid, buf, bufsize) != 0)
5580                         error = EFAULT;
5581                 break;
5582         case ZONE_ATTR_BRAND:
5583                 size = strlen(zone->zone_brand->b_name) + 1;
5584 
5585                 if (bufsize > size)
5586                         bufsize = size;
5587                 if (buf != NULL) {
5588                         err = copyoutstr(zone->zone_brand->b_name, buf,
5589                             bufsize, NULL);
5590                         if (err != 0 && err != ENAMETOOLONG)
5591                                 error = EFAULT;
5592                 }
5593                 break;
5594         case ZONE_ATTR_INITNAME:
5595                 size = strlen(zone->zone_initname) + 1;
5596                 if (bufsize > size)
5597                         bufsize = size;
5598                 if (buf != NULL) {
5599                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5600                             NULL);
5601                         if (err != 0 && err != ENAMETOOLONG)
5602                                 error = EFAULT;
5603                 }
5604                 break;
5605         case ZONE_ATTR_BOOTARGS:
5606                 if (zone->zone_bootargs == NULL)
5607                         outstr = "";
5608                 else
5609                         outstr = zone->zone_bootargs;
5610                 size = strlen(outstr) + 1;
5611                 if (bufsize > size)
5612                         bufsize = size;
5613                 if (buf != NULL) {
5614                         err = copyoutstr(outstr, buf, bufsize, NULL);
5615                         if (err != 0 && err != ENAMETOOLONG)
5616                                 error = EFAULT;
5617                 }
5618                 break;
5619         case ZONE_ATTR_PHYS_MCAP:
5620                 size = sizeof (zone->zone_phys_mcap);
5621                 if (bufsize > size)
5622                         bufsize = size;
5623                 if (buf != NULL &&
5624                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5625                         error = EFAULT;
5626                 break;
5627         case ZONE_ATTR_SCHED_CLASS:
5628                 mutex_enter(&class_lock);
5629 
5630                 if (zone->zone_defaultcid >= loaded_classes)
5631                         outstr = "";
5632                 else
5633                         outstr = sclass[zone->zone_defaultcid].cl_name;
5634                 size = strlen(outstr) + 1;
5635                 if (bufsize > size)
5636                         bufsize = size;
5637                 if (buf != NULL) {
5638                         err = copyoutstr(outstr, buf, bufsize, NULL);
5639                         if (err != 0 && err != ENAMETOOLONG)
5640                                 error = EFAULT;
5641                 }
5642 
5643                 mutex_exit(&class_lock);
5644                 break;
5645         case ZONE_ATTR_HOSTID:
5646                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5647                     bufsize == sizeof (zone->zone_hostid)) {
5648                         size = sizeof (zone->zone_hostid);
5649                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5650                             bufsize) != 0)
5651                                 error = EFAULT;
5652                 } else {
5653                         error = EINVAL;
5654                 }
5655                 break;
5656         case ZONE_ATTR_FS_ALLOWED:
5657                 if (zone->zone_fs_allowed == NULL)
5658                         outstr = "";
5659                 else
5660                         outstr = zone->zone_fs_allowed;
5661                 size = strlen(outstr) + 1;
5662                 if (bufsize > size)
5663                         bufsize = size;
5664                 if (buf != NULL) {
5665                         err = copyoutstr(outstr, buf, bufsize, NULL);
5666                         if (err != 0 && err != ENAMETOOLONG)
5667                                 error = EFAULT;
5668                 }
5669                 break;
5670         case ZONE_ATTR_SECFLAGS:
5671                 size = sizeof (zone->zone_secflags);
5672                 if (bufsize > size)
5673                         bufsize = size;
5674                 if (buf != NULL &&
                         copyout(&zone->zone_secflags, buf, bufsize) != 0)
5675                         error = EFAULT;
5676                 break;
5677         case ZONE_ATTR_NETWORK:
5678                 bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
5679                 size = bufsize;
5680                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5681                 if (copyin(buf, zbuf, bufsize) != 0) {
5682                         error = EFAULT;
5683                 } else {
5684                         error = zone_get_network(zoneid, zbuf);
5685                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5686                                 error = EFAULT;
5687                 }
5688                 kmem_free(zbuf, bufsize);
5689                 break;
5690         case ZONE_ATTR_DID:
5691                 size = sizeof (zoneid_t);
5692                 if (bufsize > size)
5693                         bufsize = size;
5694 
5695                 if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
5696                         error = EFAULT;
5697                 break;
5698         default:
5699                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5700                         size = bufsize;
5701                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5702                 } else {
5703                         error = EINVAL;
5704                 }
5705         }
5706         zone_rele(zone);
5707 
5708         if (error)
5709                 return (set_errno(error));
5710         return ((ssize_t)size);
5711 }
5712 
5713 /*
5714  * System call entry point for zone_setattr(2).
5715  */
5716 /*ARGSUSED*/
5717 static int
5718 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5719 {
5720         zone_t *zone;
5721         zone_status_t zone_status;
5722         int err = -1;
5723         zone_net_data_t *zbuf;
5724 
5725         if (secpolicy_zone_config(CRED()) != 0)
5726                 return (set_errno(EPERM));
5727 
5728         /*
5729          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5730          * global zone.
5731          */
5732         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5733                 return (set_errno(EINVAL));
5734         }
5735 
5736         mutex_enter(&zonehash_lock);
5737         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5738                 mutex_exit(&zonehash_lock);
5739                 return (set_errno(EINVAL));
5740         }
5741         zone_hold(zone);
5742         mutex_exit(&zonehash_lock);
5743 
5744         /*
5745          * At present most attributes can only be set on non-running,
5746          * non-global zones.
5747          */
5748         zone_status = zone_status_get(zone);
5749         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5750                 err = EINVAL;
5751                 goto done;
5752         }
5753 
5754         switch (attr) {
5755         case ZONE_ATTR_INITNAME:
5756                 err = zone_set_initname(zone, (const char *)buf);
5757                 break;
5758         case ZONE_ATTR_INITNORESTART:
5759                 zone->zone_restart_init = B_FALSE;
5760                 err = 0;
5761                 break;
5762         case ZONE_ATTR_BOOTARGS:
5763                 err = zone_set_bootargs(zone, (const char *)buf);
5764                 break;
5765         case ZONE_ATTR_BRAND:
5766                 err = zone_set_brand(zone, (const char *)buf);
5767                 break;
5768         case ZONE_ATTR_FS_ALLOWED:
5769                 err = zone_set_fs_allowed(zone, (const char *)buf);
5770                 break;
5771         case ZONE_ATTR_SECFLAGS:
5772                 err = zone_set_secflags(zone, (psecflags_t *)buf);
5773                 break;
5774         case ZONE_ATTR_PHYS_MCAP:
5775                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5776                 break;
5777         case ZONE_ATTR_SCHED_CLASS:
5778                 err = zone_set_sched_class(zone, (const char *)buf);
5779                 break;
5780         case ZONE_ATTR_HOSTID:
5781                 if (bufsize == sizeof (zone->zone_hostid)) {
5782                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5783                                 err = 0;
5784                         else
5785                                 err = EFAULT;
5786                 } else {
5787                         err = EINVAL;
5788                 }
5789                 break;
5790         case ZONE_ATTR_NETWORK:
5791                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5792                         err = EINVAL;
5793                         break;
5794                 }
5795                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5796                 if (copyin(buf, zbuf, bufsize) != 0) {
5797                         kmem_free(zbuf, bufsize);
5798                         err = EFAULT;
5799                         break;
5800                 }
5801                 err = zone_set_network(zoneid, zbuf);
5802                 kmem_free(zbuf, bufsize);
5803                 break;
5804         default:
5805                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5806                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5807                 else
5808                         err = EINVAL;
5809         }
5810 
5811 done:
5812         zone_rele(zone);
5813         ASSERT(err != -1);
5814         return (err != 0 ? set_errno(err) : 0);
5815 }
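
/*
 * Illustrative userland sketch (hypothetical helper, not part of this
 * file): setting the hostid of a zone that has not yet booted.  As
 * enforced above, the buffer size must exactly match the kernel's
 * zone_hostid field (a uint32_t in this codebase), and the zone must
 * not be past the "ready" state.
 *
 *	#include <zone.h>
 *	#include <sys/zone.h>
 *	#include <inttypes.h>
 *
 *	int
 *	set_zone_hostid(zoneid_t zoneid, uint32_t hostid)
 *	{
 *		return (zone_setattr(zoneid, ZONE_ATTR_HOSTID, &hostid,
 *		    sizeof (hostid)));
 *	}
 */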
5816 
5817 /*
5818  * Return zero if the process has at least one vnode mapped into its
5819  * address space which shouldn't be allowed to change zones.
5820  *
5821  * Also return zero if the process has any shared mappings which reserve
5822  * swap.  This is because the counting for zone.max-swap does not allow swap
5823  * reservation to be shared between zones.  Zone swap reservation is counted
5824  * in zone->zone_max_swap.
5825  */
5826 static int
5827 as_can_change_zones(void)
5828 {
5829         proc_t *pp = curproc;
5830         struct seg *seg;
5831         struct as *as = pp->p_as;
5832         vnode_t *vp;
5833         int allow = 1;
5834 
5835         ASSERT(pp->p_as != &kas);
5836         AS_LOCK_ENTER(as, RW_READER);
5837         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5838 
5839                 /*
5840                  * Cannot enter zone with shared anon memory which
5841                  * reserves swap.  See comment above.
5842                  */
5843                 if (seg_can_change_zones(seg) == B_FALSE) {
5844                         allow = 0;
5845                         break;
5846                 }
5847                 /*
5848                  * if we can't get a backing vnode for this segment then skip
5849                  * it.
5850                  */
5851                 vp = NULL;
5852                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5853                         continue;
5854                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5855                         allow = 0;
5856                         break;
5857                 }
5858         }
5859         AS_LOCK_EXIT(as);
5860         return (allow);
5861 }
5862 
5863 /*
5864  * Count swap reserved by curproc's address space
5865  */
5866 static size_t
5867 as_swresv(void)
5868 {
5869         proc_t *pp = curproc;
5870         struct seg *seg;
5871         struct as *as = pp->p_as;
5872         size_t swap = 0;
5873 
5874         ASSERT(pp->p_as != &kas);
5875         ASSERT(AS_WRITE_HELD(as));
5876         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5877                 swap += seg_swresv(seg);
5878 
5879         return (swap);
5880 }
5881 
5882 /*
5883  * System call entry point for zone_enter().
5884  *
5885  * The current process is injected into the specified zone.  In the process
5886  * it will change its project membership, privileges, rootdir/cwd,
5887  * zone-wide rctls, and pool association to match those of the zone.
5888  *
5889  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5890  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5891  * enter a zone that is "ready" or "running".
5892  */
5893 static int
5894 zone_enter(zoneid_t zoneid)
5895 {
5896         zone_t *zone;
5897         vnode_t *vp;
5898         proc_t *pp = curproc;
5899         contract_t *ct;
5900         cont_process_t *ctp;
5901         task_t *tk, *oldtk;
5902         kproject_t *zone_proj0;
5903         cred_t *cr, *newcr;
5904         pool_t *oldpool, *newpool;
5905         sess_t *sp;
5906         uid_t uid;
5907         zone_status_t status;
5908         int err = 0;
5909         rctl_entity_p_t e;
5910         size_t swap;
5911         kthread_id_t t;
5912 
5913         if (secpolicy_zone_config(CRED()) != 0)
5914                 return (set_errno(EPERM));
5915         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5916                 return (set_errno(EINVAL));
5917 
5918         /*
5919          * Stop all lwps so we don't need to hold a lock to look at
5920          * curproc->p_zone.  This needs to happen before we grab any
5921          * locks to avoid deadlock (another lwp in the process could
5922          * be waiting for the held lock).
5923          */
5924         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5925                 return (set_errno(EINTR));
5926 
5927         /*
5928          * Make sure we're not changing zones with files open or mapped
5929          * into our address space which shouldn't be changing zones.
5930          */
5931         if (!files_can_change_zones()) {
5932                 err = EBADF;
5933                 goto out;
5934         }
5935         if (!as_can_change_zones()) {
5936                 err = EFAULT;
5937                 goto out;
5938         }
5939 
5940         mutex_enter(&zonehash_lock);
5941         if (pp->p_zone != global_zone) {
5942                 mutex_exit(&zonehash_lock);
5943                 err = EINVAL;
5944                 goto out;
5945         }
5946 
5947         zone = zone_find_all_by_id(zoneid);
5948         if (zone == NULL) {
5949                 mutex_exit(&zonehash_lock);
5950                 err = EINVAL;
5951                 goto out;
5952         }
5953 
5954         /*
5955          * To prevent processes in a zone from holding contracts on
5956          * extrazonal resources, and to avoid process contract
5957          * memberships which span zones, contract holders and processes
5958          * which aren't the sole members of their encapsulating process
5959          * contracts are not allowed to zone_enter.
5960          */
5961         ctp = pp->p_ct_process;
5962         ct = &ctp->conp_contract;
5963         mutex_enter(&ct->ct_lock);
5964         mutex_enter(&pp->p_lock);
5965         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5966                 mutex_exit(&pp->p_lock);
5967                 mutex_exit(&ct->ct_lock);
5968                 mutex_exit(&zonehash_lock);
5969                 err = EINVAL;
5970                 goto out;
5971         }
5972 
5973         /*
5974          * Moreover, we don't allow processes whose encapsulating
5975          * process contracts have inherited extrazonal contracts.
5976          * While it would be easier to eliminate all process contracts
5977          * with inherited contracts, we need to be able to give a
5978          * restarted init (or other zone-penetrating process) its
5979          * predecessor's contracts.
5980          */
5981         if (ctp->conp_ninherited != 0) {
5982                 contract_t *next;
5983                 for (next = list_head(&ctp->conp_inherited); next;
5984                     next = list_next(&ctp->conp_inherited, next)) {
5985                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5986                                 mutex_exit(&pp->p_lock);
5987                                 mutex_exit(&ct->ct_lock);
5988                                 mutex_exit(&zonehash_lock);
5989                                 err = EINVAL;
5990                                 goto out;
5991                         }
5992                 }
5993         }
5994 
5995         mutex_exit(&pp->p_lock);
5996         mutex_exit(&ct->ct_lock);
5997 
5998         status = zone_status_get(zone);
5999         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
6000                 /*
6001                  * Can't join
6002                  */
6003                 mutex_exit(&zonehash_lock);
6004                 err = EINVAL;
6005                 goto out;
6006         }
6007 
6008         /*
6009          * Make sure new priv set is within the permitted set for caller
6010          */
6011         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
6012                 mutex_exit(&zonehash_lock);
6013                 err = EPERM;
6014                 goto out;
6015         }
6016         /*
6017          * We want to momentarily drop zonehash_lock while we optimistically
6018          * bind curproc to the pool it should be running in.  This is safe
6019          * since the zone can't disappear (we have a hold on it).
6020          */
6021         zone_hold(zone);
6022         mutex_exit(&zonehash_lock);
6023 
6024         /*
6025          * Grab pool_lock to keep the pools configuration from changing
6026          * and to stop ourselves from getting rebound to another pool
6027          * until we join the zone.
6028          */
6029         if (pool_lock_intr() != 0) {
6030                 zone_rele(zone);
6031                 err = EINTR;
6032                 goto out;
6033         }
6034         ASSERT(secpolicy_pool(CRED()) == 0);
6035         /*
6036          * Bind ourselves to the pool currently associated with the zone.
6037          */
6038         oldpool = curproc->p_pool;
6039         newpool = zone_pool_get(zone);
6040         if (pool_state == POOL_ENABLED && newpool != oldpool &&
6041             (err = pool_do_bind(newpool, P_PID, P_MYID,
6042             POOL_BIND_ALL)) != 0) {
6043                 pool_unlock();
6044                 zone_rele(zone);
6045                 goto out;
6046         }
6047 
6048         /*
6049          * Grab cpu_lock now; we'll need it later when we call
6050          * task_join().
6051          */
6052         mutex_enter(&cpu_lock);
6053         mutex_enter(&zonehash_lock);
6054         /*
6055          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6056          */
6057         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6058                 /*
6059                  * Can't join anymore.
6060                  */
6061                 mutex_exit(&zonehash_lock);
6062                 mutex_exit(&cpu_lock);
6063                 if (pool_state == POOL_ENABLED &&
6064                     newpool != oldpool)
6065                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
6066                             POOL_BIND_ALL);
6067                 pool_unlock();
6068                 zone_rele(zone);
6069                 err = EINVAL;
6070                 goto out;
6071         }
6072 
6073         /*
6074          * a_lock must be held while transferring locked memory and swap
6075          * reservation from the global zone to the non-global zone because
6076          * asynchronous faults on the process's address space can lock
6077          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6078          * segments respectively.
6079          */
6080         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6081         swap = as_swresv();
6082         mutex_enter(&pp->p_lock);
6083         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6084         /* verify that we do not exceed any task or lwp limits */
6085         mutex_enter(&zone->zone_nlwps_lock);
6086         /* add new lwps to zone and zone's proj0 */
6087         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6088         zone->zone_nlwps += pp->p_lwpcnt;
6089         /* add 1 task to zone's proj0 */
6090         zone_proj0->kpj_ntasks += 1;
6091 
6092         zone_proj0->kpj_nprocs++;
6093         zone->zone_nprocs++;
6094         mutex_exit(&zone->zone_nlwps_lock);
6095 
6096         mutex_enter(&zone->zone_mem_lock);
6097         zone->zone_locked_mem += pp->p_locked_mem;
6098         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6099         zone->zone_max_swap += swap;
6100         mutex_exit(&zone->zone_mem_lock);
6101 
6102         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6103         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6104         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6105 
6106         /* remove lwps and process from proc's old zone and old project */
6107         mutex_enter(&pp->p_zone->zone_nlwps_lock);
6108         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6109         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6110         pp->p_task->tk_proj->kpj_nprocs--;
6111         pp->p_zone->zone_nprocs--;
6112         mutex_exit(&pp->p_zone->zone_nlwps_lock);
6113 
6114         mutex_enter(&pp->p_zone->zone_mem_lock);
6115         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6116         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6117         pp->p_zone->zone_max_swap -= swap;
6118         mutex_exit(&pp->p_zone->zone_mem_lock);
6119 
6120         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6121         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6122         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6123 
6124         pp->p_flag |= SZONETOP;
6125         pp->p_zone = zone;
6126         mutex_exit(&pp->p_lock);
6127         AS_LOCK_EXIT(pp->p_as);
6128 
6129         /*
6130          * Joining the zone cannot fail from now on.
6131          *
6132          * This means that a lot of the following code can be commonized and
6133          * shared with zsched().
6134          */
6135 
6136         /*
6137          * If the process contract FMRI was inherited, we need to
6138          * flag this so that any contract status will not leak
6139          * extra zone information (svc_fmri in this case).
6140          */
6141         if (ctp->conp_svc_ctid != ct->ct_id) {
6142                 mutex_enter(&ct->ct_lock);
6143                 ctp->conp_svc_zone_enter = ct->ct_id;
6144                 mutex_exit(&ct->ct_lock);
6145         }
6146 
6147         /*
6148          * Reset the encapsulating process contract's zone.
6149          */
6150         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6151         contract_setzuniqid(ct, zone->zone_uniqid);
6152 
6153         /*
6154          * Create a new task and associate the process with the project keyed
6155          * by (projid,zoneid).
6156          *
6157          * We might as well be in project 0; the global zone's projid doesn't
6158          * make much sense in a zone anyhow.
6159          *
6160          * This also increments zone_ntasks, and returns with p_lock held.
6161          */
6162         tk = task_create(0, zone);
6163         oldtk = task_join(tk, 0);
6164         mutex_exit(&cpu_lock);
6165 
6166         /*
6167          * call RCTLOP_SET functions on this proc
6168          */
6169         e.rcep_p.zone = zone;
6170         e.rcep_t = RCENTITY_ZONE;
6171         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6172             RCD_CALLBACK);
6173         mutex_exit(&pp->p_lock);
6174 
6175         /*
6176          * We don't need to hold any of zsched's locks here; not only do we know
6177          * the process and zone aren't going away, we know its session isn't
6178          * changing either.
6179          *
6180          * By joining zsched's session here, we mimic the behavior in the
6181          * global zone of init's sid being the pid of sched.  We extend this
6182          * to all zlogin-like zone_enter()'ing processes as well.
6183          */
6184         mutex_enter(&pidlock);
6185         sp = zone->zone_zsched->p_sessp;
6186         sess_hold(zone->zone_zsched);
6187         mutex_enter(&pp->p_lock);
6188         pgexit(pp);
6189         sess_rele(pp->p_sessp, B_TRUE);
6190         pp->p_sessp = sp;
6191         pgjoin(pp, zone->zone_zsched->p_pidp);
6192 
6193         /*
6194          * If any threads are scheduled to be placed on the zone's wait
6195          * queue, they should abandon the idea since the wait queue is changing.
6196          * We need to be holding pidlock & p_lock to do this.
6197          */
6198         if ((t = pp->p_tlist) != NULL) {
6199                 do {
6200                         thread_lock(t);
6201                         /*
6202                          * Kick this thread so that it doesn't sit
6203                          * on a wrong wait queue.
6204                          */
6205                         if (ISWAITING(t))
6206                                 setrun_locked(t);
6207 
6208                         if (t->t_schedflag & TS_ANYWAITQ)
6209                                 t->t_schedflag &= ~ TS_ANYWAITQ;
6210 
6211                         thread_unlock(t);
6212                 } while ((t = t->t_forw) != pp->p_tlist);
6213         }
6214 
6215         /*
6216          * If there is a default scheduling class for the zone and it is not
6217          * the class we are currently in, change all of the threads in the
6218          * process to the new class.  We need to be holding pidlock & p_lock
6219          * when we call parmsset so this is a good place to do it.
6220          */
6221         if (zone->zone_defaultcid > 0 &&
6222             zone->zone_defaultcid != curthread->t_cid) {
6223                 pcparms_t pcparms;
6224 
6225                 pcparms.pc_cid = zone->zone_defaultcid;
6226                 pcparms.pc_clparms[0] = 0;
6227 
6228                 /*
6229                  * If setting the class fails, we still want to enter the zone.
6230                  */
6231                 if ((t = pp->p_tlist) != NULL) {
6232                         do {
6233                                 (void) parmsset(&pcparms, t);
6234                         } while ((t = t->t_forw) != pp->p_tlist);
6235                 }
6236         }
6237 
6238         mutex_exit(&pp->p_lock);
6239         mutex_exit(&pidlock);
6240 
6241         mutex_exit(&zonehash_lock);
6242         /*
6243          * We're firmly in the zone; let pools progress.
6244          */
6245         pool_unlock();
6246         task_rele(oldtk);
6247         /*
6248          * We don't need to retain a hold on the zone since we already
6249          * incremented zone_ntasks, so the zone isn't going anywhere.
6250          */
6251         zone_rele(zone);
6252 
6253         /*
6254          * Chroot
6255          */
6256         vp = zone->zone_rootvp;
6257         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6258         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6259 
6260         /*
6261          * Change process security flags.  Note that the _effective_ flags
6262          * cannot change.
6263          */
6264         secflags_copy(&pp->p_secflags.psf_lower,
6265             &zone->zone_secflags.psf_lower);
6266         secflags_copy(&pp->p_secflags.psf_upper,
6267             &zone->zone_secflags.psf_upper);
6268         secflags_copy(&pp->p_secflags.psf_inherit,
6269             &zone->zone_secflags.psf_inherit);
6270 
6271         /*
6272          * Change process credentials
6273          */
6274         newcr = cralloc();
6275         mutex_enter(&pp->p_crlock);
6276         cr = pp->p_cred;
6277         crcopy_to(cr, newcr);
6278         crsetzone(newcr, zone);
6279         pp->p_cred = newcr;
6280 
6281         /*
6282          * Restrict all process privilege sets to zone limit
6283          */
6284         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6285         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6286         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6287         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6288         mutex_exit(&pp->p_crlock);
6289         crset(pp, newcr);
6290 
6291         /*
6292          * Adjust upcount to reflect zone entry.
6293          */
6294         uid = crgetruid(newcr);
6295         mutex_enter(&pidlock);
6296         upcount_dec(uid, GLOBAL_ZONEID);
6297         upcount_inc(uid, zoneid);
6298         mutex_exit(&pidlock);
6299 
6300         /*
6301          * Set up core file path and content.
6302          */
6303         set_core_defaults();
6304 
6305 out:
6306         /*
6307          * Let the other lwps continue.
6308          */
6309         mutex_enter(&pp->p_lock);
6310         if (curthread != pp->p_agenttp)
6311                 continuelwps(pp);
6312         mutex_exit(&pp->p_lock);
6313 
6314         return (err != 0 ? set_errno(err) : 0);
6315 }
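
/*
 * Illustrative userland sketch (hypothetical helper, not part of this
 * file): the zlogin-like pattern of entering a zone and exec'ing a
 * command.  Per the contract checks above, the entering process must be
 * the sole member of its process contract, so real callers such as
 * zlogin first fork a child in a fresh process contract; that setup and
 * all error reporting are elided here.
 *
 *	#include <zone.h>
 *	#include <unistd.h>
 *
 *	void
 *	run_in_zone(zoneid_t zoneid, char *const argv[])
 *	{
 *		if (zone_enter(zoneid) != 0)
 *			_exit(127);
 *		(void) execv(argv[0], argv);
 *		_exit(127);
 *	}
 */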
6316 
6317 /*
6318  * System call entry point for zone_list(2).
6319  *
6320  * Processes running in a (non-global) zone only see themselves.
6321  * On labeled systems, they see all zones whose label they dominate.
6322  */
6323 static int
6324 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6325 {
6326         zoneid_t *zoneids;
6327         zone_t *zone, *myzone;
6328         uint_t user_nzones, real_nzones;
6329         uint_t domi_nzones;
6330         int error;
6331 
6332         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6333                 return (set_errno(EFAULT));
6334 
6335         myzone = curproc->p_zone;
6336         ASSERT(zonecount > 0);
6337         if (myzone != global_zone) {
6338                 bslabel_t *mybslab;
6339 
6340                 if (!is_system_labeled()) {
6341                         /* just return current zone */
6342                         real_nzones = domi_nzones = 1;
6343                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6344                         zoneids[0] = myzone->zone_id;
6345                 } else {
6346                         /* return all zones that are dominated */
6347                         mutex_enter(&zonehash_lock);
6348                         real_nzones = zonecount;
6349                         domi_nzones = 0;
6350                         zoneids = kmem_alloc(real_nzones *
6351                             sizeof (zoneid_t), KM_SLEEP);
6352                         mybslab = label2bslabel(myzone->zone_slabel);
6353                         for (zone = list_head(&zone_active);
6354                             zone != NULL;
6355                             zone = list_next(&zone_active, zone)) {
6356                                 if (zone->zone_id == GLOBAL_ZONEID)
6357                                         continue;
6358                                 if (zone != myzone &&
6359                                     (zone->zone_flags & ZF_IS_SCRATCH))
6360                                         continue;
6361                                 /*
6362                                  * Note that a label always dominates
6363                                  * itself, so myzone is always included
6364                                  * in the list.
6365                                  */
6366                                 if (bldominates(mybslab,
6367                                     label2bslabel(zone->zone_slabel))) {
6368                                         zoneids[domi_nzones++] = zone->zone_id;
6369                                 }
6370                         }
6371                         mutex_exit(&zonehash_lock);
6372                 }
6373         } else {
6374                 mutex_enter(&zonehash_lock);
6375                 real_nzones = zonecount;
6376                 domi_nzones = 0;
6377                 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), KM_SLEEP);
6378                 for (zone = list_head(&zone_active); zone != NULL;
6379                     zone = list_next(&zone_active, zone))
6380                         zoneids[domi_nzones++] = zone->zone_id;
6381 
6382                 ASSERT(domi_nzones == real_nzones);
6383                 mutex_exit(&zonehash_lock);
6384         }
6385 
6386         /*
6387          * If the user has allocated space for fewer entries than we found,
6388          * then return only up to their limit.  Either way, tell them exactly how
6389          * many we found.
6390          */
6391         if (domi_nzones < user_nzones)
6392                 user_nzones = domi_nzones;
6393         error = 0;
6394         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6395                 error = EFAULT;
6396         } else if (zoneidlist != NULL && user_nzones != 0) {
6397                 if (copyout(zoneids, zoneidlist,
6398                     user_nzones * sizeof (zoneid_t)) != 0)
6399                         error = EFAULT;
6400         }
6401 
6402         kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6403 
6404         if (error != 0)
6405                 return (set_errno(error));
6406         else
6407                 return (0);
6408 }
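
/*
 * Illustrative userland sketch (hypothetical helper, not part of this
 * file): since *numzones is always updated to the number of visible
 * zones, callers typically probe for the count and then fetch the IDs.
 * This sketch assumes the count is stable between the two calls; robust
 * callers retry when it grows.
 *
 *	#include <zone.h>
 *	#include <sys/types.h>
 *	#include <stdlib.h>
 *
 *	zoneid_t *
 *	get_visible_zones(uint_t *countp)
 *	{
 *		uint_t nzones = 0;
 *		zoneid_t *ids;
 *
 *		if (zone_list(NULL, &nzones) != 0 || nzones == 0)
 *			return (NULL);
 *		if ((ids = calloc(nzones, sizeof (zoneid_t))) == NULL)
 *			return (NULL);
 *		if (zone_list(ids, &nzones) != 0) {
 *			free(ids);
 *			return (NULL);
 *		}
 *		*countp = nzones;
 *		return (ids);
 *	}
 */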
6409 
6410 /*
6411  * System call entry point for zone_lookup(2).
6412  *
6413  * Non-global zones are only able to see themselves and (on labeled systems)
6414  * the zones they dominate.
6415  */
6416 static zoneid_t
6417 zone_lookup(const char *zone_name)
6418 {
6419         char *kname;
6420         zone_t *zone;
6421         zoneid_t zoneid;
6422         int err;
6423 
6424         if (zone_name == NULL) {
6425                 /* return caller's zone id */
6426                 return (getzoneid());
6427         }
6428 
6429         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6430         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6431                 kmem_free(kname, ZONENAME_MAX);
6432                 return (set_errno(err));
6433         }
6434 
6435         mutex_enter(&zonehash_lock);
6436         zone = zone_find_all_by_name(kname);
6437         kmem_free(kname, ZONENAME_MAX);
6438         /*
6439          * In a non-global zone, a process can only look up the global zone
6440          * and its own name.  Under Trusted Extensions, label dominance rules apply.
6441          */
6442         if (zone == NULL ||
6443             zone_status_get(zone) < ZONE_IS_READY ||
6444             !zone_list_access(zone)) {
6445                 mutex_exit(&zonehash_lock);
6446                 return (set_errno(EINVAL));
6447         } else {
6448                 zoneid = zone->zone_id;
6449                 mutex_exit(&zonehash_lock);
6450                 return (zoneid);
6451         }
6452 }
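
/*
 * Illustrative userland sketch (hypothetical helper, not part of this
 * file): translating a zone name to a zone ID with zone_lookup(2); a
 * NULL name yields the caller's own zone ID.
 *
 *	#include <zone.h>
 *	#include <stdio.h>
 *
 *	void
 *	print_zoneid(const char *name)
 *	{
 *		zoneid_t id;
 *
 *		if ((id = zone_lookup(name)) == -1)
 *			perror("zone_lookup");
 *		else
 *			(void) printf("%s => %d\n",
 *			    name == NULL ? "(self)" : name, (int)id);
 *	}
 */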
6453 
6454 static int
6455 zone_version(int *version_arg)
6456 {
6457         int version = ZONE_SYSCALL_API_VERSION;
6458 
6459         if (copyout(&version, version_arg, sizeof (int)) != 0)
6460                 return (set_errno(EFAULT));
6461         return (0);
6462 }
6463 
6464 /* ARGSUSED */
6465 long
6466 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6467 {
6468         zone_def zs;
6469         int err;
6470 
6471         switch (cmd) {
6472         case ZONE_CREATE:
6473                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6474                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6475                                 return (set_errno(EFAULT));
6476                         }
6477                 } else {
6478 #ifdef _SYSCALL32_IMPL
6479                         zone_def32 zs32;
6480 
6481                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6482                                 return (set_errno(EFAULT));
6483                         }
6484                         zs.zone_name =
6485                             (const char *)(unsigned long)zs32.zone_name;
6486                         zs.zone_root =
6487                             (const char *)(unsigned long)zs32.zone_root;
6488                         zs.zone_privs =
6489                             (const struct priv_set *)
6490                             (unsigned long)zs32.zone_privs;
6491                         zs.zone_privssz = zs32.zone_privssz;
6492                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6493                         zs.rctlbufsz = zs32.rctlbufsz;
6494                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6495                         zs.zfsbufsz = zs32.zfsbufsz;
6496                         zs.extended_error =
6497                             (int *)(unsigned long)zs32.extended_error;
6498                         zs.match = zs32.match;
6499                         zs.doi = zs32.doi;
6500                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6501                         zs.flags = zs32.flags;
6502                         zs.zone_did = zs32.zone_did;
6503 #else
6504                         panic("get_udatamodel() returned bogus result\n");
6505 #endif
6506                 }
6507 
6508                 return (zone_create(zs.zone_name, zs.zone_root,
6509                     zs.zone_privs, zs.zone_privssz,
6510                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6511                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6512                     zs.extended_error, zs.match, zs.doi,
6513                     zs.label, zs.flags, zs.zone_did));
6514         case ZONE_BOOT:
6515                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6516         case ZONE_DESTROY:
6517                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6518         case ZONE_GETATTR:
6519                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6520                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6521         case ZONE_SETATTR:
6522                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6523                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6524         case ZONE_ENTER:
6525                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6526         case ZONE_LIST:
6527                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6528         case ZONE_SHUTDOWN:
6529                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6530         case ZONE_LOOKUP:
6531                 return (zone_lookup((const char *)arg1));
6532         case ZONE_VERSION:
6533                 return (zone_version((int *)arg1));
6534         case ZONE_ADD_DATALINK:
6535                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6536                     (datalink_id_t)(uintptr_t)arg2));
6537         case ZONE_DEL_DATALINK:
6538                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6539                     (datalink_id_t)(uintptr_t)arg2));
6540         case ZONE_CHECK_DATALINK: {
6541                 zoneid_t        zoneid;
6542                 boolean_t       need_copyout;
6543 
6544                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6545                         return (set_errno(EFAULT));
6546                 need_copyout = (zoneid == ALL_ZONES);
6547                 err = zone_check_datalink(&zoneid,
6548                     (datalink_id_t)(uintptr_t)arg2);
6549                 if (err == 0 && need_copyout) {
6550                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6551                                 err = EFAULT;
6552                 }
6553                 return (err == 0 ? 0 : set_errno(err));
6554         }
6555         case ZONE_LIST_DATALINK:
6556                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6557                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6558         default:
6559                 return (set_errno(EINVAL));
6560         }
6561 }
6562 
6563 struct zarg {
6564         zone_t *zone;
6565         zone_cmd_arg_t arg;
6566 };
6567 
6568 static int
6569 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6570 {
6571         char *buf;
6572         size_t buflen;
6573         int error;
6574 
6575         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6576         buf = kmem_alloc(buflen, KM_SLEEP);
6577         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6578         error = door_ki_open(buf, doorp);
6579         kmem_free(buf, buflen);
6580         return (error);
6581 }
6582 
6583 static void
6584 zone_release_door(door_handle_t *doorp)
6585 {
6586         door_ki_rele(*doorp);
6587         *doorp = NULL;
6588 }
6589 
6590 static void
6591 zone_ki_call_zoneadmd(struct zarg *zargp)
6592 {
6593         door_handle_t door = NULL;
6594         door_arg_t darg, save_arg;
6595         char *zone_name;
6596         size_t zone_namelen;
6597         zoneid_t zoneid;
6598         zone_t *zone;
6599         zone_cmd_arg_t arg;
6600         uint64_t uniqid;
6601         size_t size;
6602         int error;
6603         int retry;
6604 
6605         zone = zargp->zone;
6606         arg = zargp->arg;
6607         kmem_free(zargp, sizeof (*zargp));
6608 
6609         zone_namelen = strlen(zone->zone_name) + 1;
6610         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6611         bcopy(zone->zone_name, zone_name, zone_namelen);
6612         zoneid = zone->zone_id;
6613         uniqid = zone->zone_uniqid;
6614         /*
6615          * zoneadmd may be down, but at least we can empty out the zone.
6616          * We can ignore the return value of zone_empty() since we're called
6617          * from a kernel thread and know we won't be delivered any signals.
6618          */
6619         ASSERT(curproc == &p0);
6620         (void) zone_empty(zone);
6621         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6622         zone_rele(zone);
6623 
6624         size = sizeof (arg);
6625         darg.rbuf = (char *)&arg;
6626         darg.data_ptr = (char *)&arg;
6627         darg.rsize = size;
6628         darg.data_size = size;
6629         darg.desc_ptr = NULL;
6630         darg.desc_num = 0;
6631 
6632         save_arg = darg;
6633         /*
6634          * Since we're not holding a reference to the zone, any number of
6635          * things can go wrong, including the zone disappearing before we get a
6636          * chance to talk to zoneadmd.
6637          */
6638         for (retry = 0; /* forever */; retry++) {
6639                 if (door == NULL &&
6640                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6641                         goto next;
6642                 }
6643                 ASSERT(door != NULL);
6644 
6645                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6646                     SIZE_MAX, 0)) == 0) {
6647                         break;
6648                 }
6649                 switch (error) {
6650                 case EINTR:
6651                         /* FALLTHROUGH */
6652                 case EAGAIN:    /* process may be forking */
6653                         /*
6654                          * Back off for a bit
6655                          */
6656                         break;
6657                 case EBADF:
6658                         zone_release_door(&door);
6659                         if (zone_lookup_door(zone_name, &door) != 0) {
6660                                 /*
6661                                  * zoneadmd may be dead, but it may come back to
6662                                  * life later.
6663                                  */
6664                                 break;
6665                         }
6666                         break;
6667                 default:
6668                         cmn_err(CE_WARN,
6669                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6670                             error);
6671                         goto out;
6672                 }
6673 next:
6674                 /*
6675                  * If this isn't the same zone_t that we originally had in mind,
6676                  * then this is the same as if two kadmin requests come in at
6677                  * the same time: the first one wins.  This means we lose, so we
6678                  * bail.
6679                  */
6680                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
6681                         /*
6682                          * Problem is solved.
6683                          */
6684                         break;
6685                 }
6686                 if (zone->zone_uniqid != uniqid) {
6687                         /*
6688                          * zoneid recycled
6689                          */
6690                         zone_rele(zone);
6691                         break;
6692                 }
6693                 /*
6694                  * We could zone_status_timedwait(), but there doesn't seem to
6695                  * be much point in doing that (plus, it would mean that
6696                  * zone_free() isn't called until this thread exits).
6697                  */
6698                 zone_rele(zone);
6699                 delay(hz);
6700                 darg = save_arg;
6701         }
6702 out:
6703         if (door != NULL) {
6704                 zone_release_door(&door);
6705         }
6706         kmem_free(zone_name, zone_namelen);
6707         thread_exit();
6708 }
6709 
6710 /*
6711  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6712  * kadmin().  The caller is a process in the zone.
6713  *
6714  * In order to shut down the zone, we will hand off control to zoneadmd
6715  * (running in the global zone) via a door.  We do a half-hearted job of
6716  * killing all processes in the zone, create a kernel thread to contact
6717  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6718  * a form of generation number used to let zoneadmd (as well as
6719  * zone_destroy()) know exactly which zone they're talking about.
6720  */
6721 int
6722 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6723 {
6724         struct zarg *zargp;
6725         zone_cmd_t zcmd;
6726         zone_t *zone;
6727 
6728         zone = curproc->p_zone;
6729         ASSERT(getzoneid() != GLOBAL_ZONEID);
6730 
6731         switch (cmd) {
6732         case A_SHUTDOWN:
6733                 switch (fcn) {
6734                 case AD_HALT:
6735                 case AD_POWEROFF:
6736                         zcmd = Z_HALT;
6737                         break;
6738                 case AD_BOOT:
6739                         zcmd = Z_REBOOT;
6740                         break;
6741                 case AD_IBOOT:
6742                 case AD_SBOOT:
6743                 case AD_SIBOOT:
6744                 case AD_NOSYNC:
6745                         return (ENOTSUP);
6746                 default:
6747                         return (EINVAL);
6748                 }
6749                 break;
6750         case A_REBOOT:
6751                 zcmd = Z_REBOOT;
6752                 break;
6753         case A_FTRACE:
6754         case A_REMOUNT:
6755         case A_FREEZE:
6756         case A_DUMP:
6757         case A_CONFIG:
6758                 return (ENOTSUP);
6759         default:
6760                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6761                 return (EINVAL);
6762         }
6763 
6764         if (secpolicy_zone_admin(credp, B_FALSE))
6765                 return (EPERM);
6766         mutex_enter(&zone_status_lock);
6767 
6768         /*
6769          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6770          * is in the zone.
6771          */
6772         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6773         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6774                 /*
6775                  * This zone is already on its way down.
6776                  */
6777                 mutex_exit(&zone_status_lock);
6778                 return (0);
6779         }
6780         /*
6781          * Prevent future zone_enter()s
6782          */
6783         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6784         mutex_exit(&zone_status_lock);
6785 
6786         /*
6787          * Kill everyone now and call zoneadmd later.
6788          * zone_ki_call_zoneadmd() will do a more thorough job of this
6789          * later.
6790          */
6791         killall(zone->zone_id);
6792         /*
6793          * Now, create the thread to contact zoneadmd and do the rest of the
6794          * work.  This thread can't be created in our zone otherwise
6795          * zone_destroy() would deadlock.
6796          */
6797         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6798         zargp->arg.cmd = zcmd;
6799         zargp->arg.uniqid = zone->zone_uniqid;
6800         zargp->zone = zone;
6801         (void) strcpy(zargp->arg.locale, "C");
6802         /* mdep was already copied in for us by uadmin */
6803         if (mdep != NULL)
6804                 (void) strlcpy(zargp->arg.bootbuf, mdep,
6805                     sizeof (zargp->arg.bootbuf));
6806         zone_hold(zone);
6807 
6808         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6809             TS_RUN, minclsyspri);
6810         exit(CLD_EXITED, 0);
6811 
6812         return (EINVAL);
6813 }
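
/*
 * Illustrative userland sketch (not part of this file): from inside a
 * non-global zone the standard uadmin(2) interface maps onto the zone
 * commands above, so a sufficiently privileged in-zone process can
 * reboot its own zone.  On success the calling process exits along with
 * the rest of the zone, so the call does not return.
 *
 *	#include <sys/uadmin.h>
 *
 *	void
 *	reboot_my_zone(void)
 *	{
 *		(void) uadmin(A_SHUTDOWN, AD_BOOT, 0);
 *	}
 */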
6814 
6815 /*
6816  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6817  * status to ZONE_IS_SHUTTING_DOWN.
6818  *
6819  * This function also shuts down all running zones to ensure that they won't
6820  * fork new processes.
6821  */
6822 void
6823 zone_shutdown_global(void)
6824 {
6825         zone_t *current_zonep;
6826 
6827         ASSERT(INGLOBALZONE(curproc));
6828         mutex_enter(&zonehash_lock);
6829         mutex_enter(&zone_status_lock);
6830 
6831         /* Modify the global zone's status first. */
6832         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6833         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6834 
6835         /*
6836          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6837          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6838          * could cause assertions to fail (e.g., assertions about a zone's
6839          * state during initialization, readying, or booting) or produce races.
6840          * We'll let threads continue to initialize and ready new zones: they'll
6841          * fail to boot the new zones when they see that the global zone is
6842          * shutting down.
6843          */
6844         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6845             current_zonep = list_next(&zone_active, current_zonep)) {
6846                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6847                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6848         }
6849         mutex_exit(&zone_status_lock);
6850         mutex_exit(&zonehash_lock);
6851 }
6852 
6853 /*
6854  * Returns true if the named dataset is visible in the current zone.
6855  * If 'write' is non-NULL, it is set to 1 when the dataset is also writable.
6856  */
6857 int
6858 zone_dataset_visible(const char *dataset, int *write)
6859 {
6860         static int zfstype = -1;
6861         zone_dataset_t *zd;
6862         size_t len;
6863         zone_t *zone = curproc->p_zone;
6864         const char *name = NULL;
6865         vfs_t *vfsp = NULL;
6866 
6867         if (dataset[0] == '\0')
6868                 return (0);
6869 
6870         /*
6871          * Walk the list once, looking for datasets which match exactly, or
6872          * specify a dataset underneath an exported dataset.  If found, return
6873          * true and note that it is writable.
6874          */
6875         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6876             zd = list_next(&zone->zone_datasets, zd)) {
6877 
6878                 len = strlen(zd->zd_dataset);
6879                 if (strlen(dataset) >= len &&
6880                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6881                     (dataset[len] == '\0' || dataset[len] == '/' ||
6882                     dataset[len] == '@')) {
6883                         if (write)
6884                                 *write = 1;
6885                         return (1);
6886                 }
6887         }
6888 
6889         /*
6890          * Walk the list a second time, searching for datasets which are parents
6891          * of exported datasets.  These should be visible, but read-only.
6892          *
6893          * Note that we also have to support forms such as 'pool/dataset/', with
6894          * a trailing slash.
6895          */
6896         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6897             zd = list_next(&zone->zone_datasets, zd)) {
6898 
6899                 len = strlen(dataset);
6900                 if (dataset[len - 1] == '/')
6901                         len--;  /* Ignore trailing slash */
6902                 if (len < strlen(zd->zd_dataset) &&
6903                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6904                     zd->zd_dataset[len] == '/') {
6905                         if (write)
6906                                 *write = 0;
6907                         return (1);
6908                 }
6909         }
6910 
6911         /*
6912          * We reach here if the given dataset is not found in the zone_dataset
6913          * list.  Check if this dataset was added as a filesystem (i.e., "add fs")
6914          * instead of being delegated.  For this we search for the dataset in the
6915          * zone_vfslist of this zone. If found, return true and note that it is
6916          * not writable.
6917          */
6918 
6919         /*
6920          * Initialize zfstype if it is not initialized yet.
6921          */
6922         if (zfstype == -1) {
6923                 struct vfssw *vswp = vfs_getvfssw("zfs");
6924                 zfstype = vswp - vfssw;
6925                 vfs_unrefvfssw(vswp);
6926         }
6927 
6928         vfs_list_read_lock();
6929         vfsp = zone->zone_vfslist;
6930         do {
6931                 ASSERT(vfsp);
6932                 if (vfsp->vfs_fstype == zfstype) {
6933                         name = refstr_value(vfsp->vfs_resource);
6934 
6935                         /*
6936                          * Check if we have an exact match.
6937                          */
6938                         if (strcmp(dataset, name) == 0) {
6939                                 vfs_list_unlock();
6940                                 if (write)
6941                                         *write = 0;
6942                                 return (1);
6943                         }
6944                         /*
6945                          * We need to check if we are looking for parents of
6946                          * a dataset. These should be visible, but read-only.
6947                          */
6948                         len = strlen(dataset);
6949                         if (dataset[len - 1] == '/')
6950                                 len--;
6951 
6952                         if (len < strlen(name) &&
6953                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6954                                 vfs_list_unlock();
6955                                 if (write)
6956                                         *write = 0;
6957                                 return (1);
6958                         }
6959                 }
6960                 vfsp = vfsp->vfs_zone_next;
6961         } while (vfsp != zone->zone_vfslist);
6962 
6963         vfs_list_unlock();
6964         return (0);
6965 }
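
/*
 * Worked example of the visibility rules above, assuming a zone with the
 * single delegated dataset "tank/zones/z1" (a hypothetical name) and no
 * relevant "add fs" mounts:
 *
 *	zone_dataset_visible("tank/zones/z1", &w)	=> 1, w == 1
 *	zone_dataset_visible("tank/zones/z1/a", &w)	=> 1, w == 1
 *	zone_dataset_visible("tank/zones/z1@snap", &w)	=> 1, w == 1
 *	zone_dataset_visible("tank/zones/", &w)		=> 1, w == 0
 *	zone_dataset_visible("tank", &w)		=> 1, w == 0
 *	zone_dataset_visible("tank/other", &w)		=> 0
 */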
6966 
6967 /*
6968  * zone_find_by_any_path() -
6969  *
6970  * kernel-private routine similar to zone_find_by_path(), but which
6971  * effectively compares against zone paths rather than zonerootpath
6972  * (i.e., the last component of the zonerootpath, which should be "root/",
6973  * is not compared.)  This is done in order to accurately identify all
6974  * paths, whether zone-visible or not, including those which are parallel
6975  * to /root/, such as /dev/, /home/, etc...
6976  *
6977  * If the specified path does not fall under any zone path then the
6978  * global zone is returned.
6979  *
6980  * The treat_abs parameter indicates whether the path should be treated as
6981  * an absolute path even though it does not begin with "/".  (This supports
6982  * NFS mount syntax such as host:any/path.)
6983  *
6984  * The caller is responsible for zone_rele of the returned zone.
6985  */
6986 zone_t *
6987 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6988 {
6989         zone_t *zone;
6990         int path_offset = 0;
6991 
6992         if (path == NULL) {
6993                 zone_hold(global_zone);
6994                 return (global_zone);
6995         }
6996 
6997         if (*path != '/') {
6998                 ASSERT(treat_abs);
6999                 path_offset = 1;
7000         }
7001 
7002         mutex_enter(&zonehash_lock);
7003         for (zone = list_head(&zone_active); zone != NULL;
7004             zone = list_next(&zone_active, zone)) {
7005                 char    *c;
7006                 size_t  pathlen;
7007                 char *rootpath_start;
7008 
7009                 if (zone == global_zone)        /* skip global zone */
7010                         continue;
7011 
7012                 /* scan backwards to find start of last component */
7013                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
7014                 do {
7015                         c--;
7016                 } while (*c != '/');
7017 
7018                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
7019                 rootpath_start = (zone->zone_rootpath + path_offset);
7020                 if (strncmp(path, rootpath_start, pathlen) == 0)
7021                         break;
7022         }
7023         if (zone == NULL)
7024                 zone = global_zone;
7025         zone_hold(zone);
7026         mutex_exit(&zonehash_lock);
7027         return (zone);
7028 }
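
/*
 * Worked example (hypothetical paths): given a zone whose zonerootpath
 * is "/zones/z1/root/", the loop above matches any path beginning with
 * "/zones/z1/", so both "/zones/z1/root/etc" and the parallel
 * "/zones/z1/dev/null" resolve to that zone, while "/zones/z2/..."
 * falls through to the global zone.  With treat_abs == B_TRUE, an
 * NFS-style path such as "zones/z1/root/etc" (no leading slash) is
 * compared the same way.
 */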
7029 
7030 /*
7031  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7032  * zone_dl_t pointer if found, and NULL otherwise.
7033  */
7034 static zone_dl_t *
7035 zone_find_dl(zone_t *zone, datalink_id_t linkid)
7036 {
7037         zone_dl_t *zdl;
7038 
7039         ASSERT(mutex_owned(&zone->zone_lock));
7040         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7041             zdl = list_next(&zone->zone_dl_list, zdl)) {
7042                 if (zdl->zdl_id == linkid)
7043                         break;
7044         }
7045         return (zdl);
7046 }
7047 
7048 static boolean_t
7049 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7050 {
7051         boolean_t exists;
7052 
7053         mutex_enter(&zone->zone_lock);
7054         exists = (zone_find_dl(zone, linkid) != NULL);
7055         mutex_exit(&zone->zone_lock);
7056         return (exists);
7057 }
7058 
7059 /*
7060  * Add a datalink ID to the zone.
7061  */
7062 static int
7063 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7064 {
7065         zone_dl_t *zdl;
7066         zone_t *zone;
7067         zone_t *thiszone;
7068 
7069         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7070                 return (set_errno(ENXIO));
7071 
7072         /* Verify that the datalink ID doesn't already belong to a zone. */
7073         mutex_enter(&zonehash_lock);
7074         for (zone = list_head(&zone_active); zone != NULL;
7075             zone = list_next(&zone_active, zone)) {
7076                 if (zone_dl_exists(zone, linkid)) {
7077                         mutex_exit(&zonehash_lock);
7078                         zone_rele(thiszone);
7079                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7080                 }
7081         }
7082 
7083         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7084         zdl->zdl_id = linkid;
7085         zdl->zdl_net = NULL;
7086         mutex_enter(&thiszone->zone_lock);
7087         list_insert_head(&thiszone->zone_dl_list, zdl);
7088         mutex_exit(&thiszone->zone_lock);
7089         mutex_exit(&zonehash_lock);
7090         zone_rele(thiszone);
7091         return (0);
7092 }
7093 
7094 static int
7095 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7096 {
7097         zone_dl_t *zdl;
7098         zone_t *zone;
7099         int err = 0;
7100 
7101         if ((zone = zone_find_by_id(zoneid)) == NULL)
7102                 return (set_errno(EINVAL));
7103 
7104         mutex_enter(&zone->zone_lock);
7105         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7106                 err = ENXIO;
7107         } else {
7108                 list_remove(&zone->zone_dl_list, zdl);
7109                 nvlist_free(zdl->zdl_net);
7110                 kmem_free(zdl, sizeof (zone_dl_t));
7111         }
7112         mutex_exit(&zone->zone_lock);
7113         zone_rele(zone);
7114         return (err == 0 ? 0 : set_errno(err));
7115 }
7116 
7117 /*
7118  * If *zoneidp is ALL_ZONES, look up which zone has been assigned the
7119  * linkid and return its ID in *zoneidp.  Otherwise just check whether the
7120  * specified zone has been assigned the supplied linkid.
7121  */
7122 int
7123 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7124 {
7125         zone_t *zone;
7126         int err = ENXIO;
7127 
7128         if (*zoneidp != ALL_ZONES) {
7129                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7130                         if (zone_dl_exists(zone, linkid))
7131                                 err = 0;
7132                         zone_rele(zone);
7133                 }
7134                 return (err);
7135         }
7136 
7137         mutex_enter(&zonehash_lock);
7138         for (zone = list_head(&zone_active); zone != NULL;
7139             zone = list_next(&zone_active, zone)) {
7140                 if (zone_dl_exists(zone, linkid)) {
7141                         *zoneidp = zone->zone_id;
7142                         err = 0;
7143                         break;
7144                 }
7145         }
7146         mutex_exit(&zonehash_lock);
7147         return (err);
7148 }
7149 
7150 /*
7151  * Get the list of datalink IDs assigned to a zone.
7152  *
7153  * On input, *nump is the number of datalink IDs that can fit in the supplied
7154  * idarray.  Upon return, *nump is either set to the number of datalink IDs
7155  * that were placed in the array if the array was large enough, or to the
7156  * number of datalink IDs that the function needs to place in the array if the
7157  * array is too small.
7158  */
7159 static int
7160 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7161 {
7162         uint_t num, dlcount;
7163         zone_t *zone;
7164         zone_dl_t *zdl;
7165         datalink_id_t *idptr = idarray;
7166 
7167         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7168                 return (set_errno(EFAULT));
7169         if ((zone = zone_find_by_id(zoneid)) == NULL)
7170                 return (set_errno(ENXIO));
7171 
7172         num = 0;
7173         mutex_enter(&zone->zone_lock);
7174         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7175             zdl = list_next(&zone->zone_dl_list, zdl)) {
7176                 /*
7177                  * If the list is bigger than what the caller supplied, just
7178                  * count, don't do copyout.
7179                  */
7180                 if (++num > dlcount)
7181                         continue;
7182                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7183                         mutex_exit(&zone->zone_lock);
7184                         zone_rele(zone);
7185                         return (set_errno(EFAULT));
7186                 }
7187                 idptr++;
7188         }
7189         mutex_exit(&zone->zone_lock);
7190         zone_rele(zone);
7191 
7192         /* Whether the count increased or decreased, notify the caller. */
7193         if (num != dlcount) {
7194                 if (copyout(&num, nump, sizeof (num)) != 0)
7195                         return (set_errno(EFAULT));
7196         }
7197         return (0);
7198 }
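
/*
 * The caller-side protocol mirrors zone_list(2): probe with *nump == 0
 * and a NULL array to learn the required size, then call again.  A
 * sketch, using a hypothetical userland wrapper zone_list_datalink()
 * for the ZONE_LIST_DATALINK subcode of the zone syscall:
 *
 *	int num = 0;
 *	datalink_id_t *ids;
 *
 *	(void) zone_list_datalink(zoneid, &num, NULL);
 *	ids = calloc(num, sizeof (datalink_id_t));
 *	(void) zone_list_datalink(zoneid, &num, ids);
 */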
7199 
7200 /*
7201  * Public interface for looking up a zone by zoneid.  It's a customized
7202  * version for netstack_zone_create().  It can only be called from the ZSD
7203  * create callbacks, since it doesn't take a reference on the zone structure;
7204  * if it is called elsewhere the zone could disappear after the zonehash_lock
7205  * is dropped.
7206  *
7207  * Furthermore it
7208  * 1. Doesn't check the status of the zone.
7209  * 2. May be called even before zone_init() runs; in that case the
7210  *    address of zone0 is returned directly, and netstack_zone_create()
7211  *    will only assign zone0.zone_netstack, which won't break anything.
7212  * 3. Returns without the zone being held.
7213  */
7214 zone_t *
7215 zone_find_by_id_nolock(zoneid_t zoneid)
7216 {
7217         zone_t *zone;
7218 
7219         mutex_enter(&zonehash_lock);
7220         if (zonehashbyid == NULL)
7221                 zone = &zone0;
7222         else
7223                 zone = zone_find_all_by_id(zoneid);
7224         mutex_exit(&zonehash_lock);
7225         return (zone);
7226 }
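
/*
 * A hedged sketch of the only sanctioned call-site shape, a ZSD create
 * callback (the names below are hypothetical rather than the actual
 * netstack code).  This is safe only because a zone cannot disappear
 * while its ZSD create callbacks are running:
 *
 *	static void *
 *	example_zone_create(zoneid_t zoneid)
 *	{
 *		zone_t *zone = zone_find_by_id_nolock(zoneid);
 *
 *		return (example_state_alloc(zone));
 *	}
 */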
7227 
/*
 * Walk the datalinks for a given zone, invoking the callback once per
 * datalink ID.  A non-zero return from the callback terminates the walk
 * and is passed back to our caller.
 */
7231 int
7232 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7233     void *data)
7234 {
7235         zone_t          *zone;
7236         zone_dl_t       *zdl;
7237         datalink_id_t   *idarray;
7238         uint_t          idcount = 0;
7239         int             i, ret = 0;
7240 
7241         if ((zone = zone_find_by_id(zoneid)) == NULL)
7242                 return (ENOENT);
7243 
	/*
	 * We first build an array of linkids so that we can walk them and
	 * execute the callback with the zone_lock dropped.
	 */
7248         mutex_enter(&zone->zone_lock);
7249         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7250             zdl = list_next(&zone->zone_dl_list, zdl)) {
7251                 idcount++;
7252         }
7253 
7254         if (idcount == 0) {
7255                 mutex_exit(&zone->zone_lock);
7256                 zone_rele(zone);
7257                 return (0);
7258         }
7259 
7260         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7261         if (idarray == NULL) {
7262                 mutex_exit(&zone->zone_lock);
7263                 zone_rele(zone);
7264                 return (ENOMEM);
7265         }
7266 
7267         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7268             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7269                 idarray[i] = zdl->zdl_id;
7270         }
7271 
7272         mutex_exit(&zone->zone_lock);
7273 
	for (i = 0; i < idcount; i++) {
		if ((ret = (*cb)(idarray[i], data)) != 0)
			break;
	}
7278 
7279         zone_rele(zone);
7280         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7281         return (ret);
7282 }
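
/*
 * Example callback for the walker above (an illustrative sketch with
 * hypothetical names): count the datalinks assigned to a zone.  A
 * non-zero return from the callback would stop the walk early and be
 * propagated back to the zone_datalink_walk() caller.
 *
 *	static int
 *	count_link_cb(datalink_id_t linkid, void *arg)
 *	{
 *		uint_t *countp = arg;
 *
 *		(*countp)++;
 *		return (0);
 *	}
 *
 *	uint_t count = 0;
 *	(void) zone_datalink_walk(zoneid, count_link_cb, &count);
 */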
7283 
7284 static char *
7285 zone_net_type2name(int type)
7286 {
7287         switch (type) {
7288         case ZONE_NETWORK_ADDRESS:
7289                 return (ZONE_NET_ADDRNAME);
7290         case ZONE_NETWORK_DEFROUTER:
7291                 return (ZONE_NET_RTRNAME);
7292         default:
7293                 return (NULL);
7294         }
7295 }
7296 
7297 static int
7298 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7299 {
7300         zone_t *zone;
7301         zone_dl_t *zdl;
7302         nvlist_t *nvl;
7303         int err = 0;
7304         uint8_t *new = NULL;
7305         char *nvname;
7306         int bufsize;
7307         datalink_id_t linkid = znbuf->zn_linkid;
7308 
7309         if (secpolicy_zone_config(CRED()) != 0)
7310                 return (set_errno(EPERM));
7311 
7312         if (zoneid == GLOBAL_ZONEID)
7313                 return (set_errno(EINVAL));
7314 
7315         nvname = zone_net_type2name(znbuf->zn_type);
7316         bufsize = znbuf->zn_len;
7317         new = znbuf->zn_val;
7318         if (nvname == NULL)
7319                 return (set_errno(EINVAL));
7320 
7321         if ((zone = zone_find_by_id(zoneid)) == NULL) {
7322                 return (set_errno(EINVAL));
7323         }
7324 
7325         mutex_enter(&zone->zone_lock);
7326         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7327                 err = ENXIO;
7328                 goto done;
7329         }
7330         if ((nvl = zdl->zdl_net) == NULL) {
7331                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7332                         err = ENOMEM;
7333                         goto done;
7334                 } else {
7335                         zdl->zdl_net = nvl;
7336                 }
7337         }
7338         if (nvlist_exists(nvl, nvname)) {
7339                 err = EINVAL;
7340                 goto done;
7341         }
7342         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7343         ASSERT(err == 0);
7344 done:
7345         mutex_exit(&zone->zone_lock);
7346         zone_rele(zone);
7347         if (err != 0)
7348                 return (set_errno(err));
7349         else
7350                 return (0);
7351 }
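
/*
 * A minimal sketch of driving the setter above, which is reached via
 * zone_setattr(); the zone_net_data_t layout with a trailing zn_val
 * byte array is assumed from the usage in this file.  The value itself
 * is stored as an opaque byte array:
 *
 *	uint8_t addr[4] = { 192, 0, 2, 1 };
 *	zone_net_data_t *zn;
 *
 *	zn = kmem_zalloc(sizeof (*zn) + sizeof (addr), KM_SLEEP);
 *	zn->zn_type = ZONE_NETWORK_ADDRESS;
 *	zn->zn_linkid = linkid;
 *	zn->zn_len = sizeof (addr);
 *	bcopy(addr, zn->zn_val, sizeof (addr));
 *	err = zone_set_network(zoneid, zn);
 */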
7352 
7353 static int
7354 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7355 {
7356         zone_t *zone;
7357         zone_dl_t *zdl;
7358         nvlist_t *nvl;
7359         uint8_t *ptr;
7360         uint_t psize;
7361         int err = 0;
7362         char *nvname;
7363         int bufsize;
7364         void *buf;
7365         datalink_id_t linkid = znbuf->zn_linkid;
7366 
7367         if (zoneid == GLOBAL_ZONEID)
7368                 return (set_errno(EINVAL));
7369 
7370         nvname = zone_net_type2name(znbuf->zn_type);
7371         bufsize = znbuf->zn_len;
7372         buf = znbuf->zn_val;
7373 
7374         if (nvname == NULL)
7375                 return (set_errno(EINVAL));
7376         if ((zone = zone_find_by_id(zoneid)) == NULL)
7377                 return (set_errno(EINVAL));
7378 
7379         mutex_enter(&zone->zone_lock);
7380         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7381                 err = ENXIO;
7382                 goto done;
7383         }
7384         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7385                 err = ENOENT;
7386                 goto done;
7387         }
7388         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7389         ASSERT(err == 0);
7390 
7391         if (psize > bufsize) {
7392                 err = ENOBUFS;
7393                 goto done;
7394         }
7395         znbuf->zn_len = psize;
7396         bcopy(ptr, buf, psize);
7397 done:
7398         mutex_exit(&zone->zone_lock);
7399         zone_rele(zone);
7400         if (err != 0)
7401                 return (set_errno(err));
7402         else
7403                 return (0);
7404 }
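
/*
 * And the matching getter, with the same layout assumptions.  The caller
 * supplies the capacity in zn_len; on success zn_len is rewritten to the
 * stored value's size, while ENOBUFS means the capacity was too small:
 *
 *	size_t vlen = 64;
 *	zone_net_data_t *zn;
 *
 *	zn = kmem_zalloc(sizeof (*zn) + vlen, KM_SLEEP);
 *	zn->zn_type = ZONE_NETWORK_ADDRESS;
 *	zn->zn_linkid = linkid;
 *	zn->zn_len = vlen;
 *	if (zone_get_network(zoneid, zn) == 0) {
 *		... zn->zn_val holds zn_len bytes of value data ...
 *	}
 */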