1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent Inc. All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
  27  */
  28 
  29 /*
  30  * Zones
  31  *
  32  *   A zone is a named collection of processes, namespace constraints,
  33  *   and other system resources which comprise a secure and manageable
  34  *   application containment facility.
  35  *
  36  *   Zones (represented by the reference counted zone_t) are tracked in
  37  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  38  *   (zoneid_t) are used to track zone association.  Zone IDs are
  39  *   dynamically generated when the zone is created; if a persistent
  40  *   identifier is needed (core files, accounting logs, audit trail,
  41  *   etc.), the zone name should be used.
  42  *
  43  *
  44  *   Global Zone:
  45  *
  46  *   The global zone (zoneid 0) is automatically associated with all
  47  *   system resources that have not been bound to a user-created zone.
  48  *   This means that even systems where zones are not in active use
  49  *   have a global zone, and all processes, mounts, etc. are
  50  *   associated with that zone.  The global zone is generally
  51  *   unconstrained in terms of privileges and access, though the usual
  52  *   credential and privilege based restrictions apply.
  53  *
  54  *
  55  *   Zone States:
  56  *
  57  *   The states in which a zone may be in and the transitions are as
  58  *   follows:
  59  *
  60  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  61  *   initialized zone is added to the list of active zones on the system but
  62  *   isn't accessible.
  63  *
  64  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  65  *   not yet completed. Not possible to enter the zone, but attributes can
  66  *   be retrieved.
  67  *
  68  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  69  *   ready.  The zone is made visible after the ZSD constructor callbacks are
  70  *   executed.  A zone remains in this state until it transitions into
  71  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  72  *
  73  *   ZONE_IS_BOOTING: in this shortlived-state, zsched attempts to start
  74  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  75  *   state.
  76  *
  77  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  78  *   successfully started init.   A zone remains in this state until
  79  *   zone_shutdown() is called.
  80  *
  81  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  82  *   killing all processes running in the zone. The zone remains
  83  *   in this state until there are no more user processes running in the zone.
  84  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  85  *   Since zone_shutdown() is restartable, it may be called successfully
  86  *   multiple times for the same zone_t.  Setting of the zone's state to
  87  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  88  *   the zone's status without worrying about it being a moving target.
  89  *
  90  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  91  *   are no more user processes in the zone.  The zone remains in this
  92  *   state until there are no more kernel threads associated with the
  93  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  94  *   fail.
  95  *
  96  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  97  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  98  *   join the zone or create kernel threads therein.
  99  *
 100  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 101  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 102  *   return NULL from now on.
 103  *
 104  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 105  *   processes or threads doing work on behalf of the zone.  The zone is
 106  *   removed from the list of active zones.  zone_destroy() returns, and
 107  *   the zone can be recreated.
 108  *
 109  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 110  *   callbacks are executed, and all memory associated with the zone is
 111  *   freed.
 112  *
 113  *   Threads can wait for the zone to enter a requested state by using
 114  *   zone_status_wait() or zone_status_timedwait() with the desired
 115  *   state passed in as an argument.  Zone state transitions are
 116  *   uni-directional; it is not possible to move back to an earlier state.
 117  *
 118  *
 119  *   Zone-Specific Data:
 120  *
 121  *   Subsystems needing to maintain zone-specific data can store that
 122  *   data using the ZSD mechanism.  This provides a zone-specific data
 123  *   store, similar to thread-specific data (see pthread_getspecific(3C)
 124  *   or the TSD code in uts/common/disp/thread.c.  Also, ZSD can be used
 125  *   to register callbacks to be invoked when a zone is created, shut
 126  *   down, or destroyed.  This can be used to initialize zone-specific
 127  *   data for new zones and to clean up when zones go away.
 128  *
 129  *
 130  *   Data Structures:
 131  *
 132  *   The per-zone structure (zone_t) is reference counted, and freed
 133  *   when all references are released.  zone_hold and zone_rele can be
 134  *   used to adjust the reference count.  In addition, reference counts
 135  *   associated with the cred_t structure are tracked separately using
 136  *   zone_cred_hold and zone_cred_rele.
 137  *
 138  *   Pointers to active zone_t's are stored in two hash tables; one
 139  *   for searching by id, the other for searching by name.  Lookups
 140  *   can be performed on either basis, using zone_find_by_id and
 141  *   zone_find_by_name.  Both return zone_t pointers with the zone
 142  *   held, so zone_rele should be called when the pointer is no longer
 143  *   needed.  Zones can also be searched by path; zone_find_by_path
 144  *   returns the zone with which a path name is associated (global
 145  *   zone if the path is not within some other zone's file system
 146  *   hierarchy).  This currently requires iterating through each zone,
 147  *   so it is slower than an id or name search via a hash table.
 148  *
 149  *
 150  *   Locking:
 151  *
 152  *   zonehash_lock: This is a top-level global lock used to protect the
 153  *       zone hash tables and lists.  Zones cannot be created or destroyed
 154  *       while this lock is held.
 155  *   zone_status_lock: This is a global lock protecting zone state.
 156  *       Zones cannot change state while this lock is held.  It also
 157  *       protects the list of kernel threads associated with a zone.
 158  *   zone_lock: This is a per-zone lock used to protect several fields of
 159  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 160  *       this lock means that the zone cannot go away.
 161  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 162  *       related to the zone.max-lwps rctl.
 163  *   zone_mem_lock: This is a per-zone lock used to protect the fields
 164  *       related to the zone.max-locked-memory and zone.max-swap rctls.
 165  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 166  *       currently just max_lofi
 167  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 168  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 169  *       list (a list of zones in the ZONE_IS_DEAD state).
 170  *
 171  *   Ordering requirements:
 172  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 173  *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 174  *
 175  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 176  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 177  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 178  *
 179  *   Blocking memory allocations are permitted while holding any of the
 180  *   zone locks.
 181  *
 182  *
 183  *   System Call Interface:
 184  *
 185  *   The zone subsystem can be managed and queried from user level with
 186  *   the following system calls (all subcodes of the primary "zone"
 187  *   system call):
 188  *   - zone_create: creates a zone with selected attributes (name,
 189  *     root path, privileges, resource controls, ZFS datasets)
 190  *   - zone_enter: allows the current process to enter a zone
 191  *   - zone_getattr: reports attributes of a zone
 192  *   - zone_setattr: set attributes of a zone
 193  *   - zone_boot: set 'init' running for the zone
 194  *   - zone_list: lists all zones active in the system
 195  *   - zone_lookup: looks up zone id based on name
 196  *   - zone_shutdown: initiates shutdown process (see states above)
 197  *   - zone_destroy: completes shutdown process (see states above)
 198  *
 199  */
 200 
 201 #include <sys/priv_impl.h>
 202 #include <sys/cred.h>
 203 #include <c2/audit.h>
 204 #include <sys/debug.h>
 205 #include <sys/file.h>
 206 #include <sys/kmem.h>
 207 #include <sys/kstat.h>
 208 #include <sys/mutex.h>
 209 #include <sys/note.h>
 210 #include <sys/pathname.h>
 211 #include <sys/proc.h>
 212 #include <sys/project.h>
 213 #include <sys/sysevent.h>
 214 #include <sys/task.h>
 215 #include <sys/systm.h>
 216 #include <sys/types.h>
 217 #include <sys/utsname.h>
 218 #include <sys/vnode.h>
 219 #include <sys/vfs.h>
 220 #include <sys/systeminfo.h>
 221 #include <sys/policy.h>
 222 #include <sys/cred_impl.h>
 223 #include <sys/contract_impl.h>
 224 #include <sys/contract/process_impl.h>
 225 #include <sys/class.h>
 226 #include <sys/pool.h>
 227 #include <sys/pool_pset.h>
 228 #include <sys/pset.h>
 229 #include <sys/strlog.h>
 230 #include <sys/sysmacros.h>
 231 #include <sys/callb.h>
 232 #include <sys/vmparam.h>
 233 #include <sys/corectl.h>
 234 #include <sys/ipc_impl.h>
 235 #include <sys/klpd.h>
 236 
 237 #include <sys/door.h>
 238 #include <sys/cpuvar.h>
 239 #include <sys/sdt.h>
 240 
 241 #include <sys/uadmin.h>
 242 #include <sys/session.h>
 243 #include <sys/cmn_err.h>
 244 #include <sys/modhash.h>
 245 #include <sys/sunddi.h>
 246 #include <sys/nvpair.h>
 247 #include <sys/rctl.h>
 248 #include <sys/fss.h>
 249 #include <sys/brand.h>
 250 #include <sys/zone.h>
 251 #include <net/if.h>
 252 #include <sys/cpucaps.h>
 253 #include <vm/seg.h>
 254 #include <sys/mac.h>
 255 
 256 /*
 257  * This constant specifies the number of seconds that threads waiting for
 258  * subsystems to release a zone's general-purpose references will wait before
 259  * they log the zone's reference counts.  The constant's value shouldn't
 260  * be so small that reference counts are unnecessarily reported for zones
 261  * whose references are slowly released.  On the other hand, it shouldn't be so
 262  * large that users reboot their systems out of frustration over hung zones
 263  * before the system logs the zones' reference counts.
 264  */
 265 #define ZONE_DESTROY_TIMEOUT_SECS       60
 266 
 267 /* List of data link IDs which are accessible from the zone */
 268 typedef struct zone_dl {
 269         datalink_id_t   zdl_id;
 270         nvlist_t        *zdl_net;
 271         list_node_t     zdl_linkage;
 272 } zone_dl_t;
 273 
 274 /*
 275  * cv used to signal that all references to the zone have been released.  This
 276  * needs to be global since there may be multiple waiters, and the first to
 277  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 278  */
 279 static kcondvar_t zone_destroy_cv;
 280 /*
 281  * Lock used to serialize access to zone_cv.  This could have been per-zone,
 282  * but then we'd need another lock for zone_destroy_cv, and why bother?
 283  */
 284 static kmutex_t zone_status_lock;
 285 
 286 /*
 287  * ZSD-related global variables.
 288  */
 289 static kmutex_t zsd_key_lock;   /* protects the following two */
 290 /*
 291  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 292  */
 293 static zone_key_t zsd_keyval = 0;
 294 /*
 295  * Global list of registered keys.  We use this when a new zone is created.
 296  */
 297 static list_t zsd_registered_keys;
 298 
 299 int zone_hash_size = 256;
 300 static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 301 static kmutex_t zonehash_lock;
 302 static uint_t zonecount;
 303 static id_space_t *zoneid_space;
 304 
 305 /*
 306  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 307  * kernel proper runs, and which manages all other zones.
 308  *
 309  * Although not declared as static, the variable "zone0" should not be used
 310  * except for by code that needs to reference the global zone early on in boot,
 311  * before it is fully initialized.  All other consumers should use
 312  * 'global_zone'.
 313  */
 314 zone_t zone0;
 315 zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 316 
 317 /*
 318  * List of active zones, protected by zonehash_lock.
 319  */
 320 static list_t zone_active;
 321 
 322 /*
 323  * List of destroyed zones that still have outstanding cred references.
 324  * Used for debugging.  Uses a separate lock to avoid lock ordering
 325  * problems in zone_free.
 326  */
 327 static list_t zone_deathrow;
 328 static kmutex_t zone_deathrow_lock;
 329 
 330 /* number of zones is limited by virtual interface limit in IP */
 331 uint_t maxzones = 8192;
 332 
 333 /* Event channel to sent zone state change notifications */
 334 evchan_t *zone_event_chan;
 335 
 336 /*
 337  * This table holds the mapping from kernel zone states to
 338  * states visible in the state notification API.
 339  * The idea is that we only expose "obvious" states and
 340  * do not expose states which are just implementation details.
 341  */
 342 const char  *zone_status_table[] = {
 343         ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 344         ZONE_EVENT_INITIALIZED,         /* initialized */
 345         ZONE_EVENT_READY,               /* ready */
 346         ZONE_EVENT_READY,               /* booting */
 347         ZONE_EVENT_RUNNING,             /* running */
 348         ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 349         ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 350         ZONE_EVENT_SHUTTING_DOWN,       /* down */
 351         ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 352         ZONE_EVENT_UNINITIALIZED,       /* dead */
 353 };
 354 
 355 /*
 356  * This array contains the names of the subsystems listed in zone_ref_subsys_t
 357  * (see sys/zone.h).
 358  */
 359 static char *zone_ref_subsys_names[] = {
 360         "NFS",          /* ZONE_REF_NFS */
 361         "NFSv4",        /* ZONE_REF_NFSV4 */
 362         "SMBFS",        /* ZONE_REF_SMBFS */
 363         "MNTFS",        /* ZONE_REF_MNTFS */
 364         "LOFI",         /* ZONE_REF_LOFI */
 365         "VFS",          /* ZONE_REF_VFS */
 366         "IPC"           /* ZONE_REF_IPC */
 367 };
 368 
 369 /*
 370  * This isn't static so lint doesn't complain.
 371  */
 372 rctl_hndl_t rc_zone_cpu_shares;
 373 rctl_hndl_t rc_zone_locked_mem;
 374 rctl_hndl_t rc_zone_max_swap;
 375 rctl_hndl_t rc_zone_max_lofi;
 376 rctl_hndl_t rc_zone_cpu_cap;
 377 rctl_hndl_t rc_zone_nlwps;
 378 rctl_hndl_t rc_zone_nprocs;
 379 rctl_hndl_t rc_zone_shmmax;
 380 rctl_hndl_t rc_zone_shmmni;
 381 rctl_hndl_t rc_zone_semmni;
 382 rctl_hndl_t rc_zone_msgmni;
 383 
 384 const char * const zone_default_initname = "/sbin/init";
 385 static char * const zone_prefix = "/zone/";
 386 static int zone_shutdown(zoneid_t zoneid);
 387 static int zone_add_datalink(zoneid_t, datalink_id_t);
 388 static int zone_remove_datalink(zoneid_t, datalink_id_t);
 389 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 390 static int zone_set_network(zoneid_t, zone_net_data_t *);
 391 static int zone_get_network(zoneid_t, zone_net_data_t *);
 392 
 393 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 394 
 395 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 396 static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 397 static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 398 static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 399     zone_key_t);
 400 static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 401 static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 402     kmutex_t *);
 403 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 404     kmutex_t *);
 405 
 406 /*
 407  * Bump this number when you alter the zone syscall interfaces; this is
 408  * because we need to have support for previous API versions in libc
 409  * to support patching; libc calls into the kernel to determine this number.
 410  *
 411  * Version 1 of the API is the version originally shipped with Solaris 10
 412  * Version 2 alters the zone_create system call in order to support more
 413  *     arguments by moving the args into a structure; and to do better
 414  *     error reporting when zone_create() fails.
 415  * Version 3 alters the zone_create system call in order to support the
 416  *     import of ZFS datasets to zones.
 417  * Version 4 alters the zone_create system call in order to support
 418  *     Trusted Extensions.
 419  * Version 5 alters the zone_boot system call, and converts its old
 420  *     bootargs parameter to be set by the zone_setattr API instead.
 421  * Version 6 adds the flag argument to zone_create.
 422  */
 423 static const int ZONE_SYSCALL_API_VERSION = 6;
 424 
 425 /*
 426  * Certain filesystems (such as NFS and autofs) need to know which zone
 427  * the mount is being placed in.  Because of this, we need to be able to
 428  * ensure that a zone isn't in the process of being created/destroyed such
 429  * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 430  * it gets added the list of mounted zones, it ends up on the wrong zone's
 431  * mount list. Since a zone can't reside on an NFS file system, we don't
 432  * have to worry about the zonepath itself.
 433  *
 434  * The following functions: block_mounts()/resume_mounts() and
 435  * mount_in_progress()/mount_completed() are used by zones and the VFS
 436  * layer (respectively) to synchronize zone state transitions and new
 437  * mounts within a zone. This syncronization is on a per-zone basis, so
 438  * activity for one zone will not interfere with activity for another zone.
 439  *
 440  * The semantics are like a reader-reader lock such that there may
 441  * either be multiple mounts (or zone state transitions, if that weren't
 442  * serialized by zonehash_lock) in progress at the same time, but not
 443  * both.
 444  *
 445  * We use cv's so the user can ctrl-C out of the operation if it's
 446  * taking too long.
 447  *
 448  * The semantics are such that there is unfair bias towards the
 449  * "current" operation.  This means that zone halt may starve if
 450  * there is a rapid succession of new mounts coming in to the zone.
 451  */
 452 /*
 453  * Prevent new mounts from progressing to the point of calling
 454  * VFS_MOUNT().  If there are already mounts in this "region", wait for
 455  * them to complete.
 456  */
 457 static int
 458 block_mounts(zone_t *zp)
 459 {
 460         int retval = 0;
 461 
 462         /*
 463          * Since it may block for a long time, block_mounts() shouldn't be
 464          * called with zonehash_lock held.
 465          */
 466         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 467         mutex_enter(&zp->zone_mount_lock);
 468         while (zp->zone_mounts_in_progress > 0) {
 469                 if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 470                         goto signaled;
 471         }
 472         /*
 473          * A negative value of mounts_in_progress indicates that mounts
 474          * have been blocked by (-mounts_in_progress) different callers
 475          * (remotely possible if two threads enter zone_shutdown at the same
 476          * time).
 477          */
 478         zp->zone_mounts_in_progress--;
 479         retval = 1;
 480 signaled:
 481         mutex_exit(&zp->zone_mount_lock);
 482         return (retval);
 483 }
 484 
 485 /*
 486  * The VFS layer may progress with new mounts as far as we're concerned.
 487  * Allow them to progress if we were the last obstacle.
 488  */
 489 static void
 490 resume_mounts(zone_t *zp)
 491 {
 492         mutex_enter(&zp->zone_mount_lock);
 493         if (++zp->zone_mounts_in_progress == 0)
 494                 cv_broadcast(&zp->zone_mount_cv);
 495         mutex_exit(&zp->zone_mount_lock);
 496 }
 497 
 498 /*
 499  * The VFS layer is busy with a mount; this zone should wait until all
 500  * of its mounts are completed to progress.
 501  */
 502 void
 503 mount_in_progress(zone_t *zp)
 504 {
 505         mutex_enter(&zp->zone_mount_lock);
 506         while (zp->zone_mounts_in_progress < 0)
 507                 cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 508         zp->zone_mounts_in_progress++;
 509         mutex_exit(&zp->zone_mount_lock);
 510 }
 511 
 512 /*
 513  * VFS is done with one mount; wake up any waiting block_mounts()
 514  * callers if this is the last mount.
 515  */
 516 void
 517 mount_completed(zone_t *zp)
 518 {
 519         mutex_enter(&zp->zone_mount_lock);
 520         if (--zp->zone_mounts_in_progress == 0)
 521                 cv_broadcast(&zp->zone_mount_cv);
 522         mutex_exit(&zp->zone_mount_lock);
 523 }
 524 
 525 /*
 526  * ZSD routines.
 527  *
 528  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 529  * defined by the pthread_key_create() and related interfaces.
 530  *
 531  * Kernel subsystems may register one or more data items and/or
 532  * callbacks to be executed when a zone is created, shutdown, or
 533  * destroyed.
 534  *
 535  * Unlike the thread counterpart, destructor callbacks will be executed
 536  * even if the data pointer is NULL and/or there are no constructor
 537  * callbacks, so it is the responsibility of such callbacks to check for
 538  * NULL data values if necessary.
 539  *
 540  * The locking strategy and overall picture is as follows:
 541  *
 542  * When someone calls zone_key_create(), a template ZSD entry is added to the
 543  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 544  * holding that lock all the existing zones are marked as
 545  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 546  * zone_zsd list (protected by zone_lock). The global list is updated first
 547  * (under zone_key_lock) to make sure that newly created zones use the
 548  * most recent list of keys. Then under zonehash_lock we walk the zones
 549  * and mark them.  Similar locking is used in zone_key_delete().
 550  *
 551  * The actual create, shutdown, and destroy callbacks are done without
 552  * holding any lock. And zsd_flags are used to ensure that the operations
 553  * completed so that when zone_key_create (and zone_create) is done, as well as
 554  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 555  * are completed.
 556  *
 557  * When new zones are created constructor callbacks for all registered ZSD
 558  * entries will be called. That also uses the above two phases of marking
 559  * what needs to be done, and then running the callbacks without holding
 560  * any locks.
 561  *
 562  * The framework does not provide any locking around zone_getspecific() and
 563  * zone_setspecific() apart from that needed for internal consistency, so
 564  * callers interested in atomic "test-and-set" semantics will need to provide
 565  * their own locking.
 566  */
 567 
 568 /*
 569  * Helper function to find the zsd_entry associated with the key in the
 570  * given list.
 571  */
 572 static struct zsd_entry *
 573 zsd_find(list_t *l, zone_key_t key)
 574 {
 575         struct zsd_entry *zsd;
 576 
 577         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 578                 if (zsd->zsd_key == key) {
 579                         return (zsd);
 580                 }
 581         }
 582         return (NULL);
 583 }
 584 
 585 /*
 586  * Helper function to find the zsd_entry associated with the key in the
 587  * given list. Move it to the front of the list.
 588  */
 589 static struct zsd_entry *
 590 zsd_find_mru(list_t *l, zone_key_t key)
 591 {
 592         struct zsd_entry *zsd;
 593 
 594         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 595                 if (zsd->zsd_key == key) {
 596                         /*
 597                          * Move to head of list to keep list in MRU order.
 598                          */
 599                         if (zsd != list_head(l)) {
 600                                 list_remove(l, zsd);
 601                                 list_insert_head(l, zsd);
 602                         }
 603                         return (zsd);
 604                 }
 605         }
 606         return (NULL);
 607 }
 608 
 609 void
 610 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 611     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 612 {
 613         struct zsd_entry *zsdp;
 614         struct zsd_entry *t;
 615         struct zone *zone;
 616         zone_key_t  key;
 617 
 618         zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 619         zsdp->zsd_data = NULL;
 620         zsdp->zsd_create = create;
 621         zsdp->zsd_shutdown = shutdown;
 622         zsdp->zsd_destroy = destroy;
 623 
 624         /*
 625          * Insert in global list of callbacks. Makes future zone creations
 626          * see it.
 627          */
 628         mutex_enter(&zsd_key_lock);
 629         key = zsdp->zsd_key = ++zsd_keyval;
 630         ASSERT(zsd_keyval != 0);
 631         list_insert_tail(&zsd_registered_keys, zsdp);
 632         mutex_exit(&zsd_key_lock);
 633 
 634         /*
 635          * Insert for all existing zones and mark them as needing
 636          * a create callback.
 637          */
 638         mutex_enter(&zonehash_lock);        /* stop the world */
 639         for (zone = list_head(&zone_active); zone != NULL;
 640             zone = list_next(&zone_active, zone)) {
 641                 zone_status_t status;
 642 
 643                 mutex_enter(&zone->zone_lock);
 644 
 645                 /* Skip zones that are on the way down or not yet up */
 646                 status = zone_status_get(zone);
 647                 if (status >= ZONE_IS_DOWN ||
 648                     status == ZONE_IS_UNINITIALIZED) {
 649                         mutex_exit(&zone->zone_lock);
 650                         continue;
 651                 }
 652 
 653                 t = zsd_find_mru(&zone->zone_zsd, key);
 654                 if (t != NULL) {
 655                         /*
 656                          * A zsd_configure already inserted it after
 657                          * we dropped zsd_key_lock above.
 658                          */
 659                         mutex_exit(&zone->zone_lock);
 660                         continue;
 661                 }
 662                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 663                 t->zsd_key = key;
 664                 t->zsd_create = create;
 665                 t->zsd_shutdown = shutdown;
 666                 t->zsd_destroy = destroy;
 667                 if (create != NULL) {
 668                         t->zsd_flags = ZSD_CREATE_NEEDED;
 669                         DTRACE_PROBE2(zsd__create__needed,
 670                             zone_t *, zone, zone_key_t, key);
 671                 }
 672                 list_insert_tail(&zone->zone_zsd, t);
 673                 mutex_exit(&zone->zone_lock);
 674         }
 675         mutex_exit(&zonehash_lock);
 676 
 677         if (create != NULL) {
 678                 /* Now call the create callback for this key */
 679                 zsd_apply_all_zones(zsd_apply_create, key);
 680         }
 681         /*
 682          * It is safe for consumers to use the key now, make it
 683          * globally visible. Specifically zone_getspecific() will
 684          * always successfully return the zone specific data associated
 685          * with the key.
 686          */
 687         *keyp = key;
 688 
 689 }
 690 
 691 /*
 692  * Function called when a module is being unloaded, or otherwise wishes
 693  * to unregister its ZSD key and callbacks.
 694  *
 695  * Remove from the global list and determine the functions that need to
 696  * be called under a global lock. Then call the functions without
 697  * holding any locks. Finally free up the zone_zsd entries. (The apply
 698  * functions need to access the zone_zsd entries to find zsd_data etc.)
 699  */
 700 int
 701 zone_key_delete(zone_key_t key)
 702 {
 703         struct zsd_entry *zsdp = NULL;
 704         zone_t *zone;
 705 
 706         mutex_enter(&zsd_key_lock);
 707         zsdp = zsd_find_mru(&zsd_registered_keys, key);
 708         if (zsdp == NULL) {
 709                 mutex_exit(&zsd_key_lock);
 710                 return (-1);
 711         }
 712         list_remove(&zsd_registered_keys, zsdp);
 713         mutex_exit(&zsd_key_lock);
 714 
 715         mutex_enter(&zonehash_lock);
 716         for (zone = list_head(&zone_active); zone != NULL;
 717             zone = list_next(&zone_active, zone)) {
 718                 struct zsd_entry *del;
 719 
 720                 mutex_enter(&zone->zone_lock);
 721                 del = zsd_find_mru(&zone->zone_zsd, key);
 722                 if (del == NULL) {
 723                         /*
 724                          * Somebody else got here first e.g the zone going
 725                          * away.
 726                          */
 727                         mutex_exit(&zone->zone_lock);
 728                         continue;
 729                 }
 730                 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 731                 ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 732                 if (del->zsd_shutdown != NULL &&
 733                     (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 734                         del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 735                         DTRACE_PROBE2(zsd__shutdown__needed,
 736                             zone_t *, zone, zone_key_t, key);
 737                 }
 738                 if (del->zsd_destroy != NULL &&
 739                     (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 740                         del->zsd_flags |= ZSD_DESTROY_NEEDED;
 741                         DTRACE_PROBE2(zsd__destroy__needed,
 742                             zone_t *, zone, zone_key_t, key);
 743                 }
 744                 mutex_exit(&zone->zone_lock);
 745         }
 746         mutex_exit(&zonehash_lock);
 747         kmem_free(zsdp, sizeof (*zsdp));
 748 
 749         /* Now call the shutdown and destroy callback for this key */
 750         zsd_apply_all_zones(zsd_apply_shutdown, key);
 751         zsd_apply_all_zones(zsd_apply_destroy, key);
 752 
 753         /* Now we can free up the zsdp structures in each zone */
 754         mutex_enter(&zonehash_lock);
 755         for (zone = list_head(&zone_active); zone != NULL;
 756             zone = list_next(&zone_active, zone)) {
 757                 struct zsd_entry *del;
 758 
 759                 mutex_enter(&zone->zone_lock);
 760                 del = zsd_find(&zone->zone_zsd, key);
 761                 if (del != NULL) {
 762                         list_remove(&zone->zone_zsd, del);
 763                         ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 764                         kmem_free(del, sizeof (*del));
 765                 }
 766                 mutex_exit(&zone->zone_lock);
 767         }
 768         mutex_exit(&zonehash_lock);
 769 
 770         return (0);
 771 }
 772 
 773 /*
 774  * ZSD counterpart of pthread_setspecific().
 775  *
 776  * Since all zsd callbacks, including those with no create function,
 777  * have an entry in zone_zsd, if the key is registered it is part of
 778  * the zone_zsd list.
 779  * Return an error if the key wasn't registerd.
 780  */
 781 int
 782 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 783 {
 784         struct zsd_entry *t;
 785 
 786         mutex_enter(&zone->zone_lock);
 787         t = zsd_find_mru(&zone->zone_zsd, key);
 788         if (t != NULL) {
 789                 /*
 790                  * Replace old value with new
 791                  */
 792                 t->zsd_data = (void *)data;
 793                 mutex_exit(&zone->zone_lock);
 794                 return (0);
 795         }
 796         mutex_exit(&zone->zone_lock);
 797         return (-1);
 798 }
 799 
 800 /*
 801  * ZSD counterpart of pthread_getspecific().
 802  */
 803 void *
 804 zone_getspecific(zone_key_t key, zone_t *zone)
 805 {
 806         struct zsd_entry *t;
 807         void *data;
 808 
 809         mutex_enter(&zone->zone_lock);
 810         t = zsd_find_mru(&zone->zone_zsd, key);
 811         data = (t == NULL ? NULL : t->zsd_data);
 812         mutex_exit(&zone->zone_lock);
 813         return (data);
 814 }
 815 
 816 /*
 817  * Function used to initialize a zone's list of ZSD callbacks and data
 818  * when the zone is being created.  The callbacks are initialized from
 819  * the template list (zsd_registered_keys). The constructor callback is
 820  * executed later (once the zone exists and with locks dropped).
 821  */
 822 static void
 823 zone_zsd_configure(zone_t *zone)
 824 {
 825         struct zsd_entry *zsdp;
 826         struct zsd_entry *t;
 827 
 828         ASSERT(MUTEX_HELD(&zonehash_lock));
 829         ASSERT(list_head(&zone->zone_zsd) == NULL);
 830         mutex_enter(&zone->zone_lock);
 831         mutex_enter(&zsd_key_lock);
 832         for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 833             zsdp = list_next(&zsd_registered_keys, zsdp)) {
 834                 /*
 835                  * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 836                  * should not have added anything to it.
 837                  */
 838                 ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 839 
 840                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 841                 t->zsd_key = zsdp->zsd_key;
 842                 t->zsd_create = zsdp->zsd_create;
 843                 t->zsd_shutdown = zsdp->zsd_shutdown;
 844                 t->zsd_destroy = zsdp->zsd_destroy;
 845                 if (zsdp->zsd_create != NULL) {
 846                         t->zsd_flags = ZSD_CREATE_NEEDED;
 847                         DTRACE_PROBE2(zsd__create__needed,
 848                             zone_t *, zone, zone_key_t, zsdp->zsd_key);
 849                 }
 850                 list_insert_tail(&zone->zone_zsd, t);
 851         }
 852         mutex_exit(&zsd_key_lock);
 853         mutex_exit(&zone->zone_lock);
 854 }
 855 
 856 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 857 
 858 /*
 859  * Helper function to execute shutdown or destructor callbacks.
 860  */
 861 static void
 862 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 863 {
 864         struct zsd_entry *t;
 865 
 866         ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 867         ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 868         ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 869 
 870         /*
 871          * Run the callback solely based on what is registered for the zone
 872          * in zone_zsd. The global list can change independently of this
 873          * as keys are registered and unregistered and we don't register new
 874          * callbacks for a zone that is in the process of going away.
 875          */
 876         mutex_enter(&zone->zone_lock);
 877         for (t = list_head(&zone->zone_zsd); t != NULL;
 878             t = list_next(&zone->zone_zsd, t)) {
 879                 zone_key_t key = t->zsd_key;
 880 
 881                 /* Skip if no callbacks registered */
 882 
 883                 if (ct == ZSD_SHUTDOWN) {
 884                         if (t->zsd_shutdown != NULL &&
 885                             (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 886                                 t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 887                                 DTRACE_PROBE2(zsd__shutdown__needed,
 888                                     zone_t *, zone, zone_key_t, key);
 889                         }
 890                 } else {
 891                         if (t->zsd_destroy != NULL &&
 892                             (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 893                                 t->zsd_flags |= ZSD_DESTROY_NEEDED;
 894                                 DTRACE_PROBE2(zsd__destroy__needed,
 895                                     zone_t *, zone, zone_key_t, key);
 896                         }
 897                 }
 898         }
 899         mutex_exit(&zone->zone_lock);
 900 
 901         /* Now call the shutdown and destroy callback for this key */
 902         zsd_apply_all_keys(zsd_apply_shutdown, zone);
 903         zsd_apply_all_keys(zsd_apply_destroy, zone);
 904 
 905 }
 906 
 907 /*
 908  * Called when the zone is going away; free ZSD-related memory, and
 909  * destroy the zone_zsd list.
 910  */
 911 static void
 912 zone_free_zsd(zone_t *zone)
 913 {
 914         struct zsd_entry *t, *next;
 915 
 916         /*
 917          * Free all the zsd_entry's we had on this zone.
 918          */
 919         mutex_enter(&zone->zone_lock);
 920         for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 921                 next = list_next(&zone->zone_zsd, t);
 922                 list_remove(&zone->zone_zsd, t);
 923                 ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 924                 kmem_free(t, sizeof (*t));
 925         }
 926         list_destroy(&zone->zone_zsd);
 927         mutex_exit(&zone->zone_lock);
 928 
 929 }
 930 
 931 /*
 932  * Apply a function to all zones for particular key value.
 933  *
 934  * The applyfn has to drop zonehash_lock if it does some work, and
 935  * then reacquire it before it returns.
 936  * When the lock is dropped we don't follow list_next even
 937  * if it is possible to do so without any hazards. This is
 938  * because we want the design to allow for the list of zones
 939  * to change in any arbitrary way during the time the
 940  * lock was dropped.
 941  *
 942  * It is safe to restart the loop at list_head since the applyfn
 943  * changes the zsd_flags as it does work, so a subsequent
 944  * pass through will have no effect in applyfn, hence the loop will terminate
 945  * in at worst O(N^2).
 946  */
 947 static void
 948 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 949 {
 950         zone_t *zone;
 951 
 952         mutex_enter(&zonehash_lock);
 953         zone = list_head(&zone_active);
 954         while (zone != NULL) {
 955                 if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 956                         /* Lock dropped - restart at head */
 957                         zone = list_head(&zone_active);
 958                 } else {
 959                         zone = list_next(&zone_active, zone);
 960                 }
 961         }
 962         mutex_exit(&zonehash_lock);
 963 }
 964 
 965 /*
 966  * Apply a function to all keys for a particular zone.
 967  *
 968  * The applyfn has to drop zonehash_lock if it does some work, and
 969  * then reacquire it before it returns.
 970  * When the lock is dropped we don't follow list_next even
 971  * if it is possible to do so without any hazards. This is
 972  * because we want the design to allow for the list of zsd callbacks
 973  * to change in any arbitrary way during the time the
 974  * lock was dropped.
 975  *
 976  * It is safe to restart the loop at list_head since the applyfn
 977  * changes the zsd_flags as it does work, so a subsequent
 978  * pass through will have no effect in applyfn, hence the loop will terminate
 979  * in at worst O(N^2).
 980  */
 981 static void
 982 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 983 {
 984         struct zsd_entry *t;
 985 
 986         mutex_enter(&zone->zone_lock);
 987         t = list_head(&zone->zone_zsd);
 988         while (t != NULL) {
 989                 if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 990                         /* Lock dropped - restart at head */
 991                         t = list_head(&zone->zone_zsd);
 992                 } else {
 993                         t = list_next(&zone->zone_zsd, t);
 994                 }
 995         }
 996         mutex_exit(&zone->zone_lock);
 997 }
 998 
 999 /*
1000  * Call the create function for the zone and key if CREATE_NEEDED
1001  * is set.
1002  * If some other thread gets here first and sets CREATE_INPROGRESS, then
1003  * we wait for that thread to complete so that we can ensure that
1004  * all the callbacks are done when we've looped over all zones/keys.
1005  *
1006  * When we call the create function, we drop the global held by the
1007  * caller, and return true to tell the caller it needs to re-evalute the
1008  * state.
1009  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1010  * remains held on exit.
1011  */
1012 static boolean_t
1013 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1014     zone_t *zone, zone_key_t key)
1015 {
1016         void *result;
1017         struct zsd_entry *t;
1018         boolean_t dropped;
1019 
1020         if (lockp != NULL) {
1021                 ASSERT(MUTEX_HELD(lockp));
1022         }
1023         if (zone_lock_held) {
1024                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1025         } else {
1026                 mutex_enter(&zone->zone_lock);
1027         }
1028 
1029         t = zsd_find(&zone->zone_zsd, key);
1030         if (t == NULL) {
1031                 /*
1032                  * Somebody else got here first e.g the zone going
1033                  * away.
1034                  */
1035                 if (!zone_lock_held)
1036                         mutex_exit(&zone->zone_lock);
1037                 return (B_FALSE);
1038         }
1039         dropped = B_FALSE;
1040         if (zsd_wait_for_inprogress(zone, t, lockp))
1041                 dropped = B_TRUE;
1042 
1043         if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1044                 t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1045                 t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1046                 DTRACE_PROBE2(zsd__create__inprogress,
1047                     zone_t *, zone, zone_key_t, key);
1048                 mutex_exit(&zone->zone_lock);
1049                 if (lockp != NULL)
1050                         mutex_exit(lockp);
1051 
1052                 dropped = B_TRUE;
1053                 ASSERT(t->zsd_create != NULL);
1054                 DTRACE_PROBE2(zsd__create__start,
1055                     zone_t *, zone, zone_key_t, key);
1056 
1057                 result = (*t->zsd_create)(zone->zone_id);
1058 
1059                 DTRACE_PROBE2(zsd__create__end,
1060                     zone_t *, zone, voidn *, result);
1061 
1062                 ASSERT(result != NULL);
1063                 if (lockp != NULL)
1064                         mutex_enter(lockp);
1065                 mutex_enter(&zone->zone_lock);
1066                 t->zsd_data = result;
1067                 t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1068                 t->zsd_flags |= ZSD_CREATE_COMPLETED;
1069                 cv_broadcast(&t->zsd_cv);
1070                 DTRACE_PROBE2(zsd__create__completed,
1071                     zone_t *, zone, zone_key_t, key);
1072         }
1073         if (!zone_lock_held)
1074                 mutex_exit(&zone->zone_lock);
1075         return (dropped);
1076 }
1077 
1078 /*
1079  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1080  * is set.
1081  * If some other thread gets here first and sets *_INPROGRESS, then
1082  * we wait for that thread to complete so that we can ensure that
1083  * all the callbacks are done when we've looped over all zones/keys.
1084  *
1085  * When we call the shutdown function, we drop the global held by the
1086  * caller, and return true to tell the caller it needs to re-evalute the
1087  * state.
1088  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1089  * remains held on exit.
1090  */
1091 static boolean_t
1092 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1093     zone_t *zone, zone_key_t key)
1094 {
1095         struct zsd_entry *t;
1096         void *data;
1097         boolean_t dropped;
1098 
1099         if (lockp != NULL) {
1100                 ASSERT(MUTEX_HELD(lockp));
1101         }
1102         if (zone_lock_held) {
1103                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1104         } else {
1105                 mutex_enter(&zone->zone_lock);
1106         }
1107 
1108         t = zsd_find(&zone->zone_zsd, key);
1109         if (t == NULL) {
1110                 /*
1111                  * Somebody else got here first e.g the zone going
1112                  * away.
1113                  */
1114                 if (!zone_lock_held)
1115                         mutex_exit(&zone->zone_lock);
1116                 return (B_FALSE);
1117         }
1118         dropped = B_FALSE;
1119         if (zsd_wait_for_creator(zone, t, lockp))
1120                 dropped = B_TRUE;
1121 
1122         if (zsd_wait_for_inprogress(zone, t, lockp))
1123                 dropped = B_TRUE;
1124 
1125         if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1126                 t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1127                 t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1128                 DTRACE_PROBE2(zsd__shutdown__inprogress,
1129                     zone_t *, zone, zone_key_t, key);
1130                 mutex_exit(&zone->zone_lock);
1131                 if (lockp != NULL)
1132                         mutex_exit(lockp);
1133                 dropped = B_TRUE;
1134 
1135                 ASSERT(t->zsd_shutdown != NULL);
1136                 data = t->zsd_data;
1137 
1138                 DTRACE_PROBE2(zsd__shutdown__start,
1139                     zone_t *, zone, zone_key_t, key);
1140 
1141                 (t->zsd_shutdown)(zone->zone_id, data);
1142                 DTRACE_PROBE2(zsd__shutdown__end,
1143                     zone_t *, zone, zone_key_t, key);
1144 
1145                 if (lockp != NULL)
1146                         mutex_enter(lockp);
1147                 mutex_enter(&zone->zone_lock);
1148                 t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1149                 t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1150                 cv_broadcast(&t->zsd_cv);
1151                 DTRACE_PROBE2(zsd__shutdown__completed,
1152                     zone_t *, zone, zone_key_t, key);
1153         }
1154         if (!zone_lock_held)
1155                 mutex_exit(&zone->zone_lock);
1156         return (dropped);
1157 }
1158 
1159 /*
1160  * Call the destroy function for the zone and key if DESTROY_NEEDED
1161  * is set.
1162  * If some other thread gets here first and sets *_INPROGRESS, then
1163  * we wait for that thread to complete so that we can ensure that
1164  * all the callbacks are done when we've looped over all zones/keys.
1165  *
1166  * When we call the destroy function, we drop the global held by the
1167  * caller, and return true to tell the caller it needs to re-evalute the
1168  * state.
1169  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1170  * remains held on exit.
1171  */
1172 static boolean_t
1173 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1174     zone_t *zone, zone_key_t key)
1175 {
1176         struct zsd_entry *t;
1177         void *data;
1178         boolean_t dropped;
1179 
1180         if (lockp != NULL) {
1181                 ASSERT(MUTEX_HELD(lockp));
1182         }
1183         if (zone_lock_held) {
1184                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1185         } else {
1186                 mutex_enter(&zone->zone_lock);
1187         }
1188 
1189         t = zsd_find(&zone->zone_zsd, key);
1190         if (t == NULL) {
1191                 /*
1192                  * Somebody else got here first e.g the zone going
1193                  * away.
1194                  */
1195                 if (!zone_lock_held)
1196                         mutex_exit(&zone->zone_lock);
1197                 return (B_FALSE);
1198         }
1199         dropped = B_FALSE;
1200         if (zsd_wait_for_creator(zone, t, lockp))
1201                 dropped = B_TRUE;
1202 
1203         if (zsd_wait_for_inprogress(zone, t, lockp))
1204                 dropped = B_TRUE;
1205 
1206         if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1207                 t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1208                 t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1209                 DTRACE_PROBE2(zsd__destroy__inprogress,
1210                     zone_t *, zone, zone_key_t, key);
1211                 mutex_exit(&zone->zone_lock);
1212                 if (lockp != NULL)
1213                         mutex_exit(lockp);
1214                 dropped = B_TRUE;
1215 
1216                 ASSERT(t->zsd_destroy != NULL);
1217                 data = t->zsd_data;
1218                 DTRACE_PROBE2(zsd__destroy__start,
1219                     zone_t *, zone, zone_key_t, key);
1220 
1221                 (t->zsd_destroy)(zone->zone_id, data);
1222                 DTRACE_PROBE2(zsd__destroy__end,
1223                     zone_t *, zone, zone_key_t, key);
1224 
1225                 if (lockp != NULL)
1226                         mutex_enter(lockp);
1227                 mutex_enter(&zone->zone_lock);
1228                 t->zsd_data = NULL;
1229                 t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1230                 t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1231                 cv_broadcast(&t->zsd_cv);
1232                 DTRACE_PROBE2(zsd__destroy__completed,
1233                     zone_t *, zone, zone_key_t, key);
1234         }
1235         if (!zone_lock_held)
1236                 mutex_exit(&zone->zone_lock);
1237         return (dropped);
1238 }
1239 
1240 /*
1241  * Wait for any CREATE_NEEDED flag to be cleared.
1242  * Returns true if lockp was temporarily dropped while waiting.
1243  */
1244 static boolean_t
1245 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1246 {
1247         boolean_t dropped = B_FALSE;
1248 
1249         while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1250                 DTRACE_PROBE2(zsd__wait__for__creator,
1251                     zone_t *, zone, struct zsd_entry *, t);
1252                 if (lockp != NULL) {
1253                         dropped = B_TRUE;
1254                         mutex_exit(lockp);
1255                 }
1256                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1257                 if (lockp != NULL) {
1258                         /* First drop zone_lock to preserve order */
1259                         mutex_exit(&zone->zone_lock);
1260                         mutex_enter(lockp);
1261                         mutex_enter(&zone->zone_lock);
1262                 }
1263         }
1264         return (dropped);
1265 }
1266 
1267 /*
1268  * Wait for any INPROGRESS flag to be cleared.
1269  * Returns true if lockp was temporarily dropped while waiting.
1270  */
1271 static boolean_t
1272 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1273 {
1274         boolean_t dropped = B_FALSE;
1275 
1276         while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1277                 DTRACE_PROBE2(zsd__wait__for__inprogress,
1278                     zone_t *, zone, struct zsd_entry *, t);
1279                 if (lockp != NULL) {
1280                         dropped = B_TRUE;
1281                         mutex_exit(lockp);
1282                 }
1283                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1284                 if (lockp != NULL) {
1285                         /* First drop zone_lock to preserve order */
1286                         mutex_exit(&zone->zone_lock);
1287                         mutex_enter(lockp);
1288                         mutex_enter(&zone->zone_lock);
1289                 }
1290         }
1291         return (dropped);
1292 }
1293 
1294 /*
1295  * Frees memory associated with the zone dataset list.
1296  */
1297 static void
1298 zone_free_datasets(zone_t *zone)
1299 {
1300         zone_dataset_t *t, *next;
1301 
1302         for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1303                 next = list_next(&zone->zone_datasets, t);
1304                 list_remove(&zone->zone_datasets, t);
1305                 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1306                 kmem_free(t, sizeof (*t));
1307         }
1308         list_destroy(&zone->zone_datasets);
1309 }
1310 
1311 /*
1312  * zone.cpu-shares resource control support.
1313  */
1314 /*ARGSUSED*/
1315 static rctl_qty_t
1316 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1317 {
1318         ASSERT(MUTEX_HELD(&p->p_lock));
1319         return (p->p_zone->zone_shares);
1320 }
1321 
1322 /*ARGSUSED*/
1323 static int
1324 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1325     rctl_qty_t nv)
1326 {
1327         ASSERT(MUTEX_HELD(&p->p_lock));
1328         ASSERT(e->rcep_t == RCENTITY_ZONE);
1329         if (e->rcep_p.zone == NULL)
1330                 return (0);
1331 
1332         e->rcep_p.zone->zone_shares = nv;
1333         return (0);
1334 }
1335 
1336 static rctl_ops_t zone_cpu_shares_ops = {
1337         rcop_no_action,
1338         zone_cpu_shares_usage,
1339         zone_cpu_shares_set,
1340         rcop_no_test
1341 };
1342 
1343 /*
1344  * zone.cpu-cap resource control support.
1345  */
1346 /*ARGSUSED*/
1347 static rctl_qty_t
1348 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1349 {
1350         ASSERT(MUTEX_HELD(&p->p_lock));
1351         return (cpucaps_zone_get(p->p_zone));
1352 }
1353 
1354 /*ARGSUSED*/
1355 static int
1356 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1357     rctl_qty_t nv)
1358 {
1359         zone_t *zone = e->rcep_p.zone;
1360 
1361         ASSERT(MUTEX_HELD(&p->p_lock));
1362         ASSERT(e->rcep_t == RCENTITY_ZONE);
1363 
1364         if (zone == NULL)
1365                 return (0);
1366 
1367         /*
1368          * set cap to the new value.
1369          */
1370         return (cpucaps_zone_set(zone, nv));
1371 }
1372 
1373 static rctl_ops_t zone_cpu_cap_ops = {
1374         rcop_no_action,
1375         zone_cpu_cap_get,
1376         zone_cpu_cap_set,
1377         rcop_no_test
1378 };
1379 
1380 /*ARGSUSED*/
1381 static rctl_qty_t
1382 zone_lwps_usage(rctl_t *r, proc_t *p)
1383 {
1384         rctl_qty_t nlwps;
1385         zone_t *zone = p->p_zone;
1386 
1387         ASSERT(MUTEX_HELD(&p->p_lock));
1388 
1389         mutex_enter(&zone->zone_nlwps_lock);
1390         nlwps = zone->zone_nlwps;
1391         mutex_exit(&zone->zone_nlwps_lock);
1392 
1393         return (nlwps);
1394 }
1395 
1396 /*ARGSUSED*/
1397 static int
1398 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1399     rctl_qty_t incr, uint_t flags)
1400 {
1401         rctl_qty_t nlwps;
1402 
1403         ASSERT(MUTEX_HELD(&p->p_lock));
1404         ASSERT(e->rcep_t == RCENTITY_ZONE);
1405         if (e->rcep_p.zone == NULL)
1406                 return (0);
1407         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1408         nlwps = e->rcep_p.zone->zone_nlwps;
1409 
1410         if (nlwps + incr > rcntl->rcv_value)
1411                 return (1);
1412 
1413         return (0);
1414 }
1415 
1416 /*ARGSUSED*/
1417 static int
1418 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1419 {
1420         ASSERT(MUTEX_HELD(&p->p_lock));
1421         ASSERT(e->rcep_t == RCENTITY_ZONE);
1422         if (e->rcep_p.zone == NULL)
1423                 return (0);
1424         e->rcep_p.zone->zone_nlwps_ctl = nv;
1425         return (0);
1426 }
1427 
1428 static rctl_ops_t zone_lwps_ops = {
1429         rcop_no_action,
1430         zone_lwps_usage,
1431         zone_lwps_set,
1432         zone_lwps_test,
1433 };
1434 
1435 /*ARGSUSED*/
1436 static rctl_qty_t
1437 zone_procs_usage(rctl_t *r, proc_t *p)
1438 {
1439         rctl_qty_t nprocs;
1440         zone_t *zone = p->p_zone;
1441 
1442         ASSERT(MUTEX_HELD(&p->p_lock));
1443 
1444         mutex_enter(&zone->zone_nlwps_lock);
1445         nprocs = zone->zone_nprocs;
1446         mutex_exit(&zone->zone_nlwps_lock);
1447 
1448         return (nprocs);
1449 }
1450 
1451 /*ARGSUSED*/
1452 static int
1453 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1454     rctl_qty_t incr, uint_t flags)
1455 {
1456         rctl_qty_t nprocs;
1457 
1458         ASSERT(MUTEX_HELD(&p->p_lock));
1459         ASSERT(e->rcep_t == RCENTITY_ZONE);
1460         if (e->rcep_p.zone == NULL)
1461                 return (0);
1462         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1463         nprocs = e->rcep_p.zone->zone_nprocs;
1464 
1465         if (nprocs + incr > rcntl->rcv_value)
1466                 return (1);
1467 
1468         return (0);
1469 }
1470 
1471 /*ARGSUSED*/
1472 static int
1473 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1474 {
1475         ASSERT(MUTEX_HELD(&p->p_lock));
1476         ASSERT(e->rcep_t == RCENTITY_ZONE);
1477         if (e->rcep_p.zone == NULL)
1478                 return (0);
1479         e->rcep_p.zone->zone_nprocs_ctl = nv;
1480         return (0);
1481 }
1482 
1483 static rctl_ops_t zone_procs_ops = {
1484         rcop_no_action,
1485         zone_procs_usage,
1486         zone_procs_set,
1487         zone_procs_test,
1488 };
1489 
1490 /*ARGSUSED*/
1491 static rctl_qty_t
1492 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1493 {
1494         ASSERT(MUTEX_HELD(&p->p_lock));
1495         return (p->p_zone->zone_shmmax);
1496 }
1497 
1498 /*ARGSUSED*/
1499 static int
1500 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1501     rctl_qty_t incr, uint_t flags)
1502 {
1503         rctl_qty_t v;
1504         ASSERT(MUTEX_HELD(&p->p_lock));
1505         ASSERT(e->rcep_t == RCENTITY_ZONE);
1506         v = e->rcep_p.zone->zone_shmmax + incr;
1507         if (v > rval->rcv_value)
1508                 return (1);
1509         return (0);
1510 }
1511 
1512 static rctl_ops_t zone_shmmax_ops = {
1513         rcop_no_action,
1514         zone_shmmax_usage,
1515         rcop_no_set,
1516         zone_shmmax_test
1517 };
1518 
1519 /*ARGSUSED*/
1520 static rctl_qty_t
1521 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1522 {
1523         ASSERT(MUTEX_HELD(&p->p_lock));
1524         return (p->p_zone->zone_ipc.ipcq_shmmni);
1525 }
1526 
1527 /*ARGSUSED*/
1528 static int
1529 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530     rctl_qty_t incr, uint_t flags)
1531 {
1532         rctl_qty_t v;
1533         ASSERT(MUTEX_HELD(&p->p_lock));
1534         ASSERT(e->rcep_t == RCENTITY_ZONE);
1535         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1536         if (v > rval->rcv_value)
1537                 return (1);
1538         return (0);
1539 }
1540 
1541 static rctl_ops_t zone_shmmni_ops = {
1542         rcop_no_action,
1543         zone_shmmni_usage,
1544         rcop_no_set,
1545         zone_shmmni_test
1546 };
1547 
1548 /*ARGSUSED*/
1549 static rctl_qty_t
1550 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1551 {
1552         ASSERT(MUTEX_HELD(&p->p_lock));
1553         return (p->p_zone->zone_ipc.ipcq_semmni);
1554 }
1555 
1556 /*ARGSUSED*/
1557 static int
1558 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1559     rctl_qty_t incr, uint_t flags)
1560 {
1561         rctl_qty_t v;
1562         ASSERT(MUTEX_HELD(&p->p_lock));
1563         ASSERT(e->rcep_t == RCENTITY_ZONE);
1564         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1565         if (v > rval->rcv_value)
1566                 return (1);
1567         return (0);
1568 }
1569 
1570 static rctl_ops_t zone_semmni_ops = {
1571         rcop_no_action,
1572         zone_semmni_usage,
1573         rcop_no_set,
1574         zone_semmni_test
1575 };
1576 
1577 /*ARGSUSED*/
1578 static rctl_qty_t
1579 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1580 {
1581         ASSERT(MUTEX_HELD(&p->p_lock));
1582         return (p->p_zone->zone_ipc.ipcq_msgmni);
1583 }
1584 
1585 /*ARGSUSED*/
1586 static int
1587 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1588     rctl_qty_t incr, uint_t flags)
1589 {
1590         rctl_qty_t v;
1591         ASSERT(MUTEX_HELD(&p->p_lock));
1592         ASSERT(e->rcep_t == RCENTITY_ZONE);
1593         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1594         if (v > rval->rcv_value)
1595                 return (1);
1596         return (0);
1597 }
1598 
1599 static rctl_ops_t zone_msgmni_ops = {
1600         rcop_no_action,
1601         zone_msgmni_usage,
1602         rcop_no_set,
1603         zone_msgmni_test
1604 };
1605 
1606 /*ARGSUSED*/
1607 static rctl_qty_t
1608 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1609 {
1610         rctl_qty_t q;
1611         ASSERT(MUTEX_HELD(&p->p_lock));
1612         mutex_enter(&p->p_zone->zone_mem_lock);
1613         q = p->p_zone->zone_locked_mem;
1614         mutex_exit(&p->p_zone->zone_mem_lock);
1615         return (q);
1616 }
1617 
1618 /*ARGSUSED*/
1619 static int
1620 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1621     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1622 {
1623         rctl_qty_t q;
1624         zone_t *z;
1625 
1626         z = e->rcep_p.zone;
1627         ASSERT(MUTEX_HELD(&p->p_lock));
1628         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1629         q = z->zone_locked_mem;
1630         if (q + incr > rcntl->rcv_value)
1631                 return (1);
1632         return (0);
1633 }
1634 
1635 /*ARGSUSED*/
1636 static int
1637 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1638     rctl_qty_t nv)
1639 {
1640         ASSERT(MUTEX_HELD(&p->p_lock));
1641         ASSERT(e->rcep_t == RCENTITY_ZONE);
1642         if (e->rcep_p.zone == NULL)
1643                 return (0);
1644         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1645         return (0);
1646 }
1647 
1648 static rctl_ops_t zone_locked_mem_ops = {
1649         rcop_no_action,
1650         zone_locked_mem_usage,
1651         zone_locked_mem_set,
1652         zone_locked_mem_test
1653 };
1654 
1655 /*ARGSUSED*/
1656 static rctl_qty_t
1657 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1658 {
1659         rctl_qty_t q;
1660         zone_t *z = p->p_zone;
1661 
1662         ASSERT(MUTEX_HELD(&p->p_lock));
1663         mutex_enter(&z->zone_mem_lock);
1664         q = z->zone_max_swap;
1665         mutex_exit(&z->zone_mem_lock);
1666         return (q);
1667 }
1668 
1669 /*ARGSUSED*/
1670 static int
1671 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1672     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1673 {
1674         rctl_qty_t q;
1675         zone_t *z;
1676 
1677         z = e->rcep_p.zone;
1678         ASSERT(MUTEX_HELD(&p->p_lock));
1679         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1680         q = z->zone_max_swap;
1681         if (q + incr > rcntl->rcv_value)
1682                 return (1);
1683         return (0);
1684 }
1685 
1686 /*ARGSUSED*/
1687 static int
1688 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1689     rctl_qty_t nv)
1690 {
1691         ASSERT(MUTEX_HELD(&p->p_lock));
1692         ASSERT(e->rcep_t == RCENTITY_ZONE);
1693         if (e->rcep_p.zone == NULL)
1694                 return (0);
1695         e->rcep_p.zone->zone_max_swap_ctl = nv;
1696         return (0);
1697 }
1698 
1699 static rctl_ops_t zone_max_swap_ops = {
1700         rcop_no_action,
1701         zone_max_swap_usage,
1702         zone_max_swap_set,
1703         zone_max_swap_test
1704 };
1705 
1706 /*ARGSUSED*/
1707 static rctl_qty_t
1708 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1709 {
1710         rctl_qty_t q;
1711         zone_t *z = p->p_zone;
1712 
1713         ASSERT(MUTEX_HELD(&p->p_lock));
1714         mutex_enter(&z->zone_rctl_lock);
1715         q = z->zone_max_lofi;
1716         mutex_exit(&z->zone_rctl_lock);
1717         return (q);
1718 }
1719 
1720 /*ARGSUSED*/
1721 static int
1722 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1723     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1724 {
1725         rctl_qty_t q;
1726         zone_t *z;
1727 
1728         z = e->rcep_p.zone;
1729         ASSERT(MUTEX_HELD(&p->p_lock));
1730         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1731         q = z->zone_max_lofi;
1732         if (q + incr > rcntl->rcv_value)
1733                 return (1);
1734         return (0);
1735 }
1736 
1737 /*ARGSUSED*/
1738 static int
1739 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1740     rctl_qty_t nv)
1741 {
1742         ASSERT(MUTEX_HELD(&p->p_lock));
1743         ASSERT(e->rcep_t == RCENTITY_ZONE);
1744         if (e->rcep_p.zone == NULL)
1745                 return (0);
1746         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1747         return (0);
1748 }
1749 
1750 static rctl_ops_t zone_max_lofi_ops = {
1751         rcop_no_action,
1752         zone_max_lofi_usage,
1753         zone_max_lofi_set,
1754         zone_max_lofi_test
1755 };
1756 
1757 /*
1758  * Helper function to brand the zone with a unique ID.
1759  */
1760 static void
1761 zone_uniqid(zone_t *zone)
1762 {
1763         static uint64_t uniqid = 0;
1764 
1765         ASSERT(MUTEX_HELD(&zonehash_lock));
1766         zone->zone_uniqid = uniqid++;
1767 }
1768 
1769 /*
1770  * Returns a held pointer to the "kcred" for the specified zone.
1771  */
1772 struct cred *
1773 zone_get_kcred(zoneid_t zoneid)
1774 {
1775         zone_t *zone;
1776         cred_t *cr;
1777 
1778         if ((zone = zone_find_by_id(zoneid)) == NULL)
1779                 return (NULL);
1780         cr = zone->zone_kcred;
1781         crhold(cr);
1782         zone_rele(zone);
1783         return (cr);
1784 }
1785 
1786 static int
1787 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1788 {
1789         zone_t *zone = ksp->ks_private;
1790         zone_kstat_t *zk = ksp->ks_data;
1791 
1792         if (rw == KSTAT_WRITE)
1793                 return (EACCES);
1794 
1795         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1796         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1797         return (0);
1798 }
1799 
1800 static int
1801 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1802 {
1803         zone_t *zone = ksp->ks_private;
1804         zone_kstat_t *zk = ksp->ks_data;
1805 
1806         if (rw == KSTAT_WRITE)
1807                 return (EACCES);
1808 
1809         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1810         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1811         return (0);
1812 }
1813 
1814 static int
1815 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1816 {
1817         zone_t *zone = ksp->ks_private;
1818         zone_kstat_t *zk = ksp->ks_data;
1819 
1820         if (rw == KSTAT_WRITE)
1821                 return (EACCES);
1822 
1823         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1824         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1825         return (0);
1826 }
1827 
1828 static kstat_t *
1829 zone_kstat_create_common(zone_t *zone, char *name,
1830     int (*updatefunc) (kstat_t *, int))
1831 {
1832         kstat_t *ksp;
1833         zone_kstat_t *zk;
1834 
1835         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1836             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1837             KSTAT_FLAG_VIRTUAL);
1838 
1839         if (ksp == NULL)
1840                 return (NULL);
1841 
1842         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1843         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1844         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1845         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1846         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1847         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1848         ksp->ks_update = updatefunc;
1849         ksp->ks_private = zone;
1850         kstat_install(ksp);
1851         return (ksp);
1852 }
1853 
1854 
1855 static int
1856 zone_mcap_kstat_update(kstat_t *ksp, int rw)
1857 {
1858         zone_t *zone = ksp->ks_private;
1859         zone_mcap_kstat_t *zmp = ksp->ks_data;
1860 
1861         if (rw == KSTAT_WRITE)
1862                 return (EACCES);
1863 
1864         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1865         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1866         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1867         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1868         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1869 
1870         return (0);
1871 }
1872 
1873 static kstat_t *
1874 zone_mcap_kstat_create(zone_t *zone)
1875 {
1876         kstat_t *ksp;
1877         zone_mcap_kstat_t *zmp;
1878 
1879         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1880             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1881             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1882             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1883                 return (NULL);
1884 
1885         if (zone->zone_id != GLOBAL_ZONEID)
1886                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1887 
1888         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1889         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1890         ksp->ks_lock = &zone->zone_mcap_lock;
1891         zone->zone_mcap_stats = zmp;
1892 
1893         /* The kstat "name" field is not large enough for a full zonename */
1894         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1895         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1896         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1897         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1898         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1899         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1900         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1901             KSTAT_DATA_UINT64);
1902 
1903         ksp->ks_update = zone_mcap_kstat_update;
1904         ksp->ks_private = zone;
1905 
1906         kstat_install(ksp);
1907         return (ksp);
1908 }
1909 
1910 static int
1911 zone_misc_kstat_update(kstat_t *ksp, int rw)
1912 {
1913         zone_t *zone = ksp->ks_private;
1914         zone_misc_kstat_t *zmp = ksp->ks_data;
1915         hrtime_t tmp;
1916 
1917         if (rw == KSTAT_WRITE)
1918                 return (EACCES);
1919 
1920         tmp = zone->zone_utime;
1921         scalehrtime(&tmp);
1922         zmp->zm_utime.value.ui64 = tmp;
1923         tmp = zone->zone_stime;
1924         scalehrtime(&tmp);
1925         zmp->zm_stime.value.ui64 = tmp;
1926         tmp = zone->zone_wtime;
1927         scalehrtime(&tmp);
1928         zmp->zm_wtime.value.ui64 = tmp;
1929 
1930         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1931         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1932         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1933 
1934         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1935         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1936         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1937         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1938 
1939         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1940 
1941         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1942         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1943 
1944         return (0);
1945 }
1946 
1947 static kstat_t *
1948 zone_misc_kstat_create(zone_t *zone)
1949 {
1950         kstat_t *ksp;
1951         zone_misc_kstat_t *zmp;
1952 
1953         if ((ksp = kstat_create_zone("zones", zone->zone_id,
1954             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1955             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1956             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1957                 return (NULL);
1958 
1959         if (zone->zone_id != GLOBAL_ZONEID)
1960                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1961 
1962         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1963         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1964         ksp->ks_lock = &zone->zone_misc_lock;
1965         zone->zone_misc_stats = zmp;
1966 
1967         /* The kstat "name" field is not large enough for a full zonename */
1968         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1969         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1970         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1971         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1972         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1973         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1974         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1975         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1976             KSTAT_DATA_UINT32);
1977         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1978         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1979             KSTAT_DATA_UINT32);
1980         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1981         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1982         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1983             KSTAT_DATA_UINT32);
1984         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1985         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1986 
1987         ksp->ks_update = zone_misc_kstat_update;
1988         ksp->ks_private = zone;
1989 
1990         kstat_install(ksp);
1991         return (ksp);
1992 }
1993 
1994 static void
1995 zone_kstat_create(zone_t *zone)
1996 {
1997         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1998             "lockedmem", zone_lockedmem_kstat_update);
1999         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2000             "swapresv", zone_swapresv_kstat_update);
2001         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2002             "nprocs", zone_nprocs_kstat_update);
2003 
2004         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2005                 zone->zone_mcap_stats = kmem_zalloc(
2006                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2007         }
2008 
2009         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2010                 zone->zone_misc_stats = kmem_zalloc(
2011                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2012         }
2013 }
2014 
2015 static void
2016 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2017 {
2018         void *data;
2019 
2020         if (*pkstat != NULL) {
2021                 data = (*pkstat)->ks_data;
2022                 kstat_delete(*pkstat);
2023                 kmem_free(data, datasz);
2024                 *pkstat = NULL;
2025         }
2026 }
2027 
2028 static void
2029 zone_kstat_delete(zone_t *zone)
2030 {
2031         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2032             sizeof (zone_kstat_t));
2033         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2034             sizeof (zone_kstat_t));
2035         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2036             sizeof (zone_kstat_t));
2037         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2038             sizeof (zone_mcap_kstat_t));
2039         zone_kstat_delete_common(&zone->zone_misc_ksp,
2040             sizeof (zone_misc_kstat_t));
2041 }
2042 
2043 /*
2044  * Called very early on in boot to initialize the ZSD list so that
2045  * zone_key_create() can be called before zone_init().  It also initializes
2046  * portions of zone0 which may be used before zone_init() is called.  The
2047  * variable "global_zone" will be set when zone0 is fully initialized by
2048  * zone_init().
2049  */
2050 void
2051 zone_zsd_init(void)
2052 {
2053         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2054         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2055         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2056             offsetof(struct zsd_entry, zsd_linkage));
2057         list_create(&zone_active, sizeof (zone_t),
2058             offsetof(zone_t, zone_linkage));
2059         list_create(&zone_deathrow, sizeof (zone_t),
2060             offsetof(zone_t, zone_linkage));
2061 
2062         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2063         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2064         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2065         zone0.zone_shares = 1;
2066         zone0.zone_nlwps = 0;
2067         zone0.zone_nlwps_ctl = INT_MAX;
2068         zone0.zone_nprocs = 0;
2069         zone0.zone_nprocs_ctl = INT_MAX;
2070         zone0.zone_locked_mem = 0;
2071         zone0.zone_locked_mem_ctl = UINT64_MAX;
2072         ASSERT(zone0.zone_max_swap == 0);
2073         zone0.zone_max_swap_ctl = UINT64_MAX;
2074         zone0.zone_max_lofi = 0;
2075         zone0.zone_max_lofi_ctl = UINT64_MAX;
2076         zone0.zone_shmmax = 0;
2077         zone0.zone_ipc.ipcq_shmmni = 0;
2078         zone0.zone_ipc.ipcq_semmni = 0;
2079         zone0.zone_ipc.ipcq_msgmni = 0;
2080         zone0.zone_name = GLOBAL_ZONENAME;
2081         zone0.zone_nodename = utsname.nodename;
2082         zone0.zone_domain = srpc_domain;
2083         zone0.zone_hostid = HW_INVALID_HOSTID;
2084         zone0.zone_fs_allowed = NULL;
2085         psecflags_default(&zone0.zone_secflags);
2086         zone0.zone_ref = 1;
2087         zone0.zone_id = GLOBAL_ZONEID;
2088         zone0.zone_status = ZONE_IS_RUNNING;
2089         zone0.zone_rootpath = "/";
2090         zone0.zone_rootpathlen = 2;
2091         zone0.zone_psetid = ZONE_PS_INVAL;
2092         zone0.zone_ncpus = 0;
2093         zone0.zone_ncpus_online = 0;
2094         zone0.zone_proc_initpid = 1;
2095         zone0.zone_initname = initname;
2096         zone0.zone_lockedmem_kstat = NULL;
2097         zone0.zone_swapresv_kstat = NULL;
2098         zone0.zone_nprocs_kstat = NULL;
2099 
2100         zone0.zone_stime = 0;
2101         zone0.zone_utime = 0;
2102         zone0.zone_wtime = 0;
2103 
2104         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2105             offsetof(zone_ref_t, zref_linkage));
2106         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2107             offsetof(struct zsd_entry, zsd_linkage));
2108         list_insert_head(&zone_active, &zone0);
2109 
2110         /*
2111          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2112          * to anything meaningful.  It is assigned to be 'rootdir' in
2113          * vfs_mountroot().
2114          */
2115         zone0.zone_rootvp = NULL;
2116         zone0.zone_vfslist = NULL;
2117         zone0.zone_bootargs = initargs;
2118         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2119         /*
2120          * The global zone has all privileges
2121          */
2122         priv_fillset(zone0.zone_privset);
2123         /*
2124          * Add p0 to the global zone
2125          */
2126         zone0.zone_zsched = &p0;
2127         p0.p_zone = &zone0;
2128 }
2129 
2130 /*
2131  * Compute a hash value based on the contents of the label and the DOI.  The
2132  * hash algorithm is somewhat arbitrary, but is based on the observation that
2133  * humans will likely pick labels that differ by amounts that work out to be
2134  * multiples of the number of hash chains, and thus stirring in some primes
2135  * should help.
2136  */
2137 static uint_t
2138 hash_bylabel(void *hdata, mod_hash_key_t key)
2139 {
2140         const ts_label_t *lab = (ts_label_t *)key;
2141         const uint32_t *up, *ue;
2142         uint_t hash;
2143         int i;
2144 
2145         _NOTE(ARGUNUSED(hdata));
2146 
2147         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2148         /* we depend on alignment of label, but not representation */
2149         up = (const uint32_t *)&lab->tsl_label;
2150         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2151         i = 1;
2152         while (up < ue) {
2153                 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2154                 hash += *up + (*up << ((i % 16) + 1));
2155                 up++;
2156                 i++;
2157         }
2158         return (hash);
2159 }
2160 
2161 /*
2162  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2163  * equal).  This may need to be changed if less than / greater than is ever
2164  * needed.
2165  */
2166 static int
2167 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2168 {
2169         ts_label_t *lab1 = (ts_label_t *)key1;
2170         ts_label_t *lab2 = (ts_label_t *)key2;
2171 
2172         return (label_equal(lab1, lab2) ? 0 : 1);
2173 }
2174 
2175 /*
2176  * Called by main() to initialize the zones framework.
2177  */
2178 void
2179 zone_init(void)
2180 {
2181         rctl_dict_entry_t *rde;
2182         rctl_val_t *dval;
2183         rctl_set_t *set;
2184         rctl_alloc_gp_t *gp;
2185         rctl_entity_p_t e;
2186         int res;
2187 
2188         ASSERT(curproc == &p0);
2189 
2190         /*
2191          * Create ID space for zone IDs.  ID 0 is reserved for the
2192          * global zone.
2193          */
2194         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2195 
2196         /*
2197          * Initialize generic zone resource controls, if any.
2198          */
2199         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2200             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2201             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2202             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2203 
2204         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2205             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2206             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2207             RCTL_GLOBAL_INFINITE,
2208             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2209 
2210         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2211             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2212             INT_MAX, INT_MAX, &zone_lwps_ops);
2213 
2214         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2215             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2216             INT_MAX, INT_MAX, &zone_procs_ops);
2217 
2218         /*
2219          * System V IPC resource controls
2220          */
2221         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2222             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2223             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2224 
2225         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2226             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2227             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2228 
2229         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2230             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2231             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2232 
2233         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2234             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2235             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2236 
2237         /*
2238          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2239          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2240          */
2241         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2242         bzero(dval, sizeof (rctl_val_t));
2243         dval->rcv_value = 1;
2244         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2245         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2246         dval->rcv_action_recip_pid = -1;
2247 
2248         rde = rctl_dict_lookup("zone.cpu-shares");
2249         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2250 
2251         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2252             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2253             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2254             &zone_locked_mem_ops);
2255 
2256         rc_zone_max_swap = rctl_register("zone.max-swap",
2257             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2258             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2259             &zone_max_swap_ops);
2260 
2261         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2262             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2263             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2264             &zone_max_lofi_ops);
2265 
2266         /*
2267          * Initialize the ``global zone''.
2268          */
2269         set = rctl_set_create();
2270         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2271         mutex_enter(&p0.p_lock);
2272         e.rcep_p.zone = &zone0;
2273         e.rcep_t = RCENTITY_ZONE;
2274         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2275             gp);
2276 
2277         zone0.zone_nlwps = p0.p_lwpcnt;
2278         zone0.zone_nprocs = 1;
2279         zone0.zone_ntasks = 1;
2280         mutex_exit(&p0.p_lock);
2281         zone0.zone_restart_init = B_TRUE;
2282         zone0.zone_brand = &native_brand;
2283         rctl_prealloc_destroy(gp);
2284         /*
2285          * pool_default hasn't been initialized yet, so we let pool_init()
2286          * take care of making sure the global zone is in the default pool.
2287          */
2288 
2289         /*
2290          * Initialize global zone kstats
2291          */
2292         zone_kstat_create(&zone0);
2293 
2294         /*
2295          * Initialize zone label.
2296          * mlp are initialized when tnzonecfg is loaded.
2297          */
2298         zone0.zone_slabel = l_admin_low;
2299         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2300         label_hold(l_admin_low);
2301 
2302         /*
2303          * Initialise the lock for the database structure used by mntfs.
2304          */
2305         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2306 
2307         mutex_enter(&zonehash_lock);
2308         zone_uniqid(&zone0);
2309         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2310 
2311         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2312             mod_hash_null_valdtor);
2313         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2314             zone_hash_size, mod_hash_null_valdtor);
2315         /*
2316          * maintain zonehashbylabel only for labeled systems
2317          */
2318         if (is_system_labeled())
2319                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2320                     zone_hash_size, mod_hash_null_keydtor,
2321                     mod_hash_null_valdtor, hash_bylabel, NULL,
2322                     hash_labelkey_cmp, KM_SLEEP);
2323         zonecount = 1;
2324 
2325         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2326             (mod_hash_val_t)&zone0);
2327         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2328             (mod_hash_val_t)&zone0);
2329         if (is_system_labeled()) {
2330                 zone0.zone_flags |= ZF_HASHED_LABEL;
2331                 (void) mod_hash_insert(zonehashbylabel,
2332                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2333         }
2334         mutex_exit(&zonehash_lock);
2335 
2336         /*
2337          * We avoid setting zone_kcred until now, since kcred is initialized
2338          * sometime after zone_zsd_init() and before zone_init().
2339          */
2340         zone0.zone_kcred = kcred;
2341         /*
2342          * The global zone is fully initialized (except for zone_rootvp which
2343          * will be set when the root filesystem is mounted).
2344          */
2345         global_zone = &zone0;
2346 
2347         /*
2348          * Setup an event channel to send zone status change notifications on
2349          */
2350         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2351             EVCH_CREAT);
2352 
2353         if (res)
2354                 panic("Sysevent_evc_bind failed during zone setup.\n");
2355 
2356 }
2357 
2358 static void
2359 zone_free(zone_t *zone)
2360 {
2361         ASSERT(zone != global_zone);
2362         ASSERT(zone->zone_ntasks == 0);
2363         ASSERT(zone->zone_nlwps == 0);
2364         ASSERT(zone->zone_nprocs == 0);
2365         ASSERT(zone->zone_cred_ref == 0);
2366         ASSERT(zone->zone_kcred == NULL);
2367         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2368             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2369         ASSERT(list_is_empty(&zone->zone_ref_list));
2370 
2371         /*
2372          * Remove any zone caps.
2373          */
2374         cpucaps_zone_remove(zone);
2375 
2376         ASSERT(zone->zone_cpucap == NULL);
2377 
2378         /* remove from deathrow list */
2379         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2380                 ASSERT(zone->zone_ref == 0);
2381                 mutex_enter(&zone_deathrow_lock);
2382                 list_remove(&zone_deathrow, zone);
2383                 mutex_exit(&zone_deathrow_lock);
2384         }
2385 
2386         list_destroy(&zone->zone_ref_list);
2387         zone_free_zsd(zone);
2388         zone_free_datasets(zone);
2389         list_destroy(&zone->zone_dl_list);
2390 
2391         if (zone->zone_rootvp != NULL)
2392                 VN_RELE(zone->zone_rootvp);
2393         if (zone->zone_rootpath)
2394                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2395         if (zone->zone_name != NULL)
2396                 kmem_free(zone->zone_name, ZONENAME_MAX);
2397         if (zone->zone_slabel != NULL)
2398                 label_rele(zone->zone_slabel);
2399         if (zone->zone_nodename != NULL)
2400                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2401         if (zone->zone_domain != NULL)
2402                 kmem_free(zone->zone_domain, _SYS_NMLN);
2403         if (zone->zone_privset != NULL)
2404                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2405         if (zone->zone_rctls != NULL)
2406                 rctl_set_free(zone->zone_rctls);
2407         if (zone->zone_bootargs != NULL)
2408                 strfree(zone->zone_bootargs);
2409         if (zone->zone_initname != NULL)
2410                 strfree(zone->zone_initname);
2411         if (zone->zone_fs_allowed != NULL)
2412                 strfree(zone->zone_fs_allowed);
2413         if (zone->zone_pfexecd != NULL)
2414                 klpd_freelist(&zone->zone_pfexecd);
2415         id_free(zoneid_space, zone->zone_id);
2416         mutex_destroy(&zone->zone_lock);
2417         cv_destroy(&zone->zone_cv);
2418         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2419         rw_destroy(&zone->zone_mntfs_db_lock);
2420         kmem_free(zone, sizeof (zone_t));
2421 }
2422 
2423 /*
2424  * See block comment at the top of this file for information about zone
2425  * status values.
2426  */
2427 /*
2428  * Convenience function for setting zone status.
2429  */
2430 static void
2431 zone_status_set(zone_t *zone, zone_status_t status)
2432 {
2433 
2434         nvlist_t *nvl = NULL;
2435         ASSERT(MUTEX_HELD(&zone_status_lock));
2436         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2437             status >= zone_status_get(zone));
2438 
2439         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2440             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2441             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2442             zone_status_table[status]) ||
2443             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2444             zone_status_table[zone->zone_status]) ||
2445             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2446             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2447             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2448             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2449 #ifdef DEBUG
2450                 (void) printf(
2451                     "Failed to allocate and send zone state change event.\n");
2452 #endif
2453         }
2454         nvlist_free(nvl);
2455 
2456         zone->zone_status = status;
2457 
2458         cv_broadcast(&zone->zone_cv);
2459 }
2460 
2461 /*
2462  * Public function to retrieve the zone status.  The zone status may
2463  * change after it is retrieved.
2464  */
2465 zone_status_t
2466 zone_status_get(zone_t *zone)
2467 {
2468         return (zone->zone_status);
2469 }
2470 
2471 static int
2472 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2473 {
2474         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2475         int err = 0;
2476 
2477         ASSERT(zone != global_zone);
2478         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2479                 goto done;      /* EFAULT or ENAMETOOLONG */
2480 
2481         if (zone->zone_bootargs != NULL)
2482                 strfree(zone->zone_bootargs);
2483 
2484         zone->zone_bootargs = strdup(buf);
2485 
2486 done:
2487         kmem_free(buf, BOOTARGS_MAX);
2488         return (err);
2489 }
2490 
2491 static int
2492 zone_set_brand(zone_t *zone, const char *brand)
2493 {
2494         struct brand_attr *attrp;
2495         brand_t *bp;
2496 
2497         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2498         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2499                 kmem_free(attrp, sizeof (struct brand_attr));
2500                 return (EFAULT);
2501         }
2502 
2503         bp = brand_register_zone(attrp);
2504         kmem_free(attrp, sizeof (struct brand_attr));
2505         if (bp == NULL)
2506                 return (EINVAL);
2507 
2508         /*
2509          * This is the only place where a zone can change it's brand.
2510          * We already need to hold zone_status_lock to check the zone
2511          * status, so we'll just use that lock to serialize zone
2512          * branding requests as well.
2513          */
2514         mutex_enter(&zone_status_lock);
2515 
2516         /* Re-Branding is not allowed and the zone can't be booted yet */
2517         if ((ZONE_IS_BRANDED(zone)) ||
2518             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2519                 mutex_exit(&zone_status_lock);
2520                 brand_unregister_zone(bp);
2521                 return (EINVAL);
2522         }
2523 
2524         /* set up the brand specific data */
2525         zone->zone_brand = bp;
2526         ZBROP(zone)->b_init_brand_data(zone);
2527 
2528         mutex_exit(&zone_status_lock);
2529         return (0);
2530 }
2531 
2532 static int
2533 zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2534 {
2535         int err = 0;
2536         psecflags_t psf;
2537 
2538         ASSERT(zone != global_zone);
2539 
2540         if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
2541                 return (err);
2542 
2543         if (zone_status_get(zone) > ZONE_IS_READY)
2544                 return (EINVAL);
2545 
2546         if (!psecflags_validate(&psf))
2547                 return (EINVAL);
2548 
2549         (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2550 
2551         /* Set security flags on the zone's zsched */
2552         (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2553             sizeof (zone->zone_zsched->p_secflags));
2554 
2555         return (0);
2556 }
2557 
2558 static int
2559 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2560 {
2561         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2562         int err = 0;
2563 
2564         ASSERT(zone != global_zone);
2565         if ((err = copyinstr(zone_fs_allowed, buf,
2566             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2567                 goto done;
2568 
2569         if (zone->zone_fs_allowed != NULL)
2570                 strfree(zone->zone_fs_allowed);
2571 
2572         zone->zone_fs_allowed = strdup(buf);
2573 
2574 done:
2575         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2576         return (err);
2577 }
2578 
2579 static int
2580 zone_set_initname(zone_t *zone, const char *zone_initname)
2581 {
2582         char initname[INITNAME_SZ];
2583         size_t len;
2584         int err = 0;
2585 
2586         ASSERT(zone != global_zone);
2587         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2588                 return (err);   /* EFAULT or ENAMETOOLONG */
2589 
2590         if (zone->zone_initname != NULL)
2591                 strfree(zone->zone_initname);
2592 
2593         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2594         (void) strcpy(zone->zone_initname, initname);
2595         return (0);
2596 }
2597 
2598 static int
2599 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2600 {
2601         uint64_t mcap;
2602         int err = 0;
2603 
2604         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2605                 zone->zone_phys_mcap = mcap;
2606 
2607         return (err);
2608 }
2609 
2610 static int
2611 zone_set_sched_class(zone_t *zone, const char *new_class)
2612 {
2613         char sched_class[PC_CLNMSZ];
2614         id_t classid;
2615         int err;
2616 
2617         ASSERT(zone != global_zone);
2618         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2619                 return (err);   /* EFAULT or ENAMETOOLONG */
2620 
2621         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2622                 return (set_errno(EINVAL));
2623         zone->zone_defaultcid = classid;
2624         ASSERT(zone->zone_defaultcid > 0 &&
2625             zone->zone_defaultcid < loaded_classes);
2626 
2627         return (0);
2628 }
2629 
2630 /*
2631  * Block indefinitely waiting for (zone_status >= status)
2632  */
2633 void
2634 zone_status_wait(zone_t *zone, zone_status_t status)
2635 {
2636         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2637 
2638         mutex_enter(&zone_status_lock);
2639         while (zone->zone_status < status) {
2640                 cv_wait(&zone->zone_cv, &zone_status_lock);
2641         }
2642         mutex_exit(&zone_status_lock);
2643 }
2644 
2645 /*
2646  * Private CPR-safe version of zone_status_wait().
2647  */
2648 static void
2649 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2650 {
2651         callb_cpr_t cprinfo;
2652 
2653         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2654 
2655         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2656             str);
2657         mutex_enter(&zone_status_lock);
2658         while (zone->zone_status < status) {
2659                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2660                 cv_wait(&zone->zone_cv, &zone_status_lock);
2661                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2662         }
2663         /*
2664          * zone_status_lock is implicitly released by the following.
2665          */
2666         CALLB_CPR_EXIT(&cprinfo);
2667 }
2668 
2669 /*
2670  * Block until zone enters requested state or signal is received.  Return (0)
2671  * if signaled, non-zero otherwise.
2672  */
2673 int
2674 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2675 {
2676         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2677 
2678         mutex_enter(&zone_status_lock);
2679         while (zone->zone_status < status) {
2680                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2681                         mutex_exit(&zone_status_lock);
2682                         return (0);
2683                 }
2684         }
2685         mutex_exit(&zone_status_lock);
2686         return (1);
2687 }
2688 
2689 /*
2690  * Block until the zone enters the requested state or the timeout expires,
2691  * whichever happens first.  Return (-1) if operation timed out, time remaining
2692  * otherwise.
2693  */
2694 clock_t
2695 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2696 {
2697         clock_t timeleft = 0;
2698 
2699         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2700 
2701         mutex_enter(&zone_status_lock);
2702         while (zone->zone_status < status && timeleft != -1) {
2703                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2704         }
2705         mutex_exit(&zone_status_lock);
2706         return (timeleft);
2707 }
2708 
2709 /*
2710  * Block until the zone enters the requested state, the current process is
2711  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2712  * operation timed out, 0 if signaled, time remaining otherwise.
2713  */
2714 clock_t
2715 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2716 {
2717         clock_t timeleft = tim - ddi_get_lbolt();
2718 
2719         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2720 
2721         mutex_enter(&zone_status_lock);
2722         while (zone->zone_status < status) {
2723                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2724                     tim);
2725                 if (timeleft <= 0)
2726                         break;
2727         }
2728         mutex_exit(&zone_status_lock);
2729         return (timeleft);
2730 }
2731 
2732 /*
2733  * Zones have two reference counts: one for references from credential
2734  * structures (zone_cred_ref), and one (zone_ref) for everything else.
2735  * This is so we can allow a zone to be rebooted while there are still
2736  * outstanding cred references, since certain drivers cache dblks (which
2737  * implicitly results in cached creds).  We wait for zone_ref to drop to
2738  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2739  * later freed when the zone_cred_ref drops to 0, though nothing other
2740  * than the zone id and privilege set should be accessed once the zone
2741  * is "dead".
2742  *
2743  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2744  * to force halt/reboot to block waiting for the zone_cred_ref to drop
2745  * to 0.  This can be useful to flush out other sources of cached creds
2746  * that may be less innocuous than the driver case.
2747  *
2748  * Zones also provide a tracked reference counting mechanism in which zone
2749  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2750  * debuggers determine the sources of leaked zone references.  See
2751  * zone_hold_ref() and zone_rele_ref() below for more information.
2752  */
2753 
2754 int zone_wait_for_cred = 0;
2755 
2756 static void
2757 zone_hold_locked(zone_t *z)
2758 {
2759         ASSERT(MUTEX_HELD(&z->zone_lock));
2760         z->zone_ref++;
2761         ASSERT(z->zone_ref != 0);
2762 }
2763 
2764 /*
2765  * Increment the specified zone's reference count.  The zone's zone_t structure
2766  * will not be freed as long as the zone's reference count is nonzero.
2767  * Decrement the zone's reference count via zone_rele().
2768  *
2769  * NOTE: This function should only be used to hold zones for short periods of
2770  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2771  */
2772 void
2773 zone_hold(zone_t *z)
2774 {
2775         mutex_enter(&z->zone_lock);
2776         zone_hold_locked(z);
2777         mutex_exit(&z->zone_lock);
2778 }
2779 
2780 /*
2781  * If the non-cred ref count drops to 1 and either the cred ref count
2782  * is 0 or we aren't waiting for cred references, the zone is ready to
2783  * be destroyed.
2784  */
2785 #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
2786             (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2787 
2788 /*
2789  * Common zone reference release function invoked by zone_rele() and
2790  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2791  * zone's subsystem-specific reference counters are not affected by the
2792  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2793  * removed from the specified zone's reference list.  ref must be non-NULL iff
2794  * subsys is not ZONE_REF_NUM_SUBSYS.
2795  */
2796 static void
2797 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2798 {
2799         boolean_t wakeup;
2800 
2801         mutex_enter(&z->zone_lock);
2802         ASSERT(z->zone_ref != 0);
2803         z->zone_ref--;
2804         if (subsys != ZONE_REF_NUM_SUBSYS) {
2805                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2806                 z->zone_subsys_ref[subsys]--;
2807                 list_remove(&z->zone_ref_list, ref);
2808         }
2809         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2810                 /* no more refs, free the structure */
2811                 mutex_exit(&z->zone_lock);
2812                 zone_free(z);
2813                 return;
2814         }
2815         /* signal zone_destroy so the zone can finish halting */
2816         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2817         mutex_exit(&z->zone_lock);
2818 
2819         if (wakeup) {
2820                 /*
2821                  * Grabbing zonehash_lock here effectively synchronizes with
2822                  * zone_destroy() to avoid missed signals.
2823                  */
2824                 mutex_enter(&zonehash_lock);
2825                 cv_broadcast(&zone_destroy_cv);
2826                 mutex_exit(&zonehash_lock);
2827         }
2828 }
2829 
2830 /*
2831  * Decrement the specified zone's reference count.  The specified zone will
2832  * cease to exist after this function returns if the reference count drops to
2833  * zero.  This function should be paired with zone_hold().
2834  */
2835 void
2836 zone_rele(zone_t *z)
2837 {
2838         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2839 }
2840 
2841 /*
2842  * Initialize a zone reference structure.  This function must be invoked for
2843  * a reference structure before the structure is passed to zone_hold_ref().
2844  */
2845 void
2846 zone_init_ref(zone_ref_t *ref)
2847 {
2848         ref->zref_zone = NULL;
2849         list_link_init(&ref->zref_linkage);
2850 }
2851 
2852 /*
2853  * Acquire a reference to zone z.  The caller must specify the
2854  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2855  * zone_ref_t structure will represent a reference to the specified zone.  Use
2856  * zone_rele_ref() to release the reference.
2857  *
2858  * The referenced zone_t structure will not be freed as long as the zone_t's
2859  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2860  * references.
2861  *
2862  * NOTE: The zone_ref_t structure must be initialized before it is used.
2863  * See zone_init_ref() above.
2864  */
2865 void
2866 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2867 {
2868         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2869 
2870         /*
2871          * Prevent consumers from reusing a reference structure before
2872          * releasing it.
2873          */
2874         VERIFY(ref->zref_zone == NULL);
2875 
2876         ref->zref_zone = z;
2877         mutex_enter(&z->zone_lock);
2878         zone_hold_locked(z);
2879         z->zone_subsys_ref[subsys]++;
2880         ASSERT(z->zone_subsys_ref[subsys] != 0);
2881         list_insert_head(&z->zone_ref_list, ref);
2882         mutex_exit(&z->zone_lock);
2883 }
2884 
2885 /*
2886  * Release the zone reference represented by the specified zone_ref_t.
2887  * The reference is invalid after it's released; however, the zone_ref_t
2888  * structure can be reused without having to invoke zone_init_ref().
2889  * subsys should be the same value that was passed to zone_hold_ref()
2890  * when the reference was acquired.
2891  */
2892 void
2893 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2894 {
2895         zone_rele_common(ref->zref_zone, ref, subsys);
2896 
2897         /*
2898          * Set the zone_ref_t's zref_zone field to NULL to generate panics
2899          * when consumers dereference the reference.  This helps us catch
2900          * consumers who use released references.  Furthermore, this lets
2901          * consumers reuse the zone_ref_t structure without having to
2902          * invoke zone_init_ref().
2903          */
2904         ref->zref_zone = NULL;
2905 }
2906 
2907 void
2908 zone_cred_hold(zone_t *z)
2909 {
2910         mutex_enter(&z->zone_lock);
2911         z->zone_cred_ref++;
2912         ASSERT(z->zone_cred_ref != 0);
2913         mutex_exit(&z->zone_lock);
2914 }
2915 
2916 void
2917 zone_cred_rele(zone_t *z)
2918 {
2919         boolean_t wakeup;
2920 
2921         mutex_enter(&z->zone_lock);
2922         ASSERT(z->zone_cred_ref != 0);
2923         z->zone_cred_ref--;
2924         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2925                 /* no more refs, free the structure */
2926                 mutex_exit(&z->zone_lock);
2927                 zone_free(z);
2928                 return;
2929         }
2930         /*
2931          * If zone_destroy is waiting for the cred references to drain
2932          * out, and they have, signal it.
2933          */
2934         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2935             zone_status_get(z) >= ZONE_IS_DEAD);
2936         mutex_exit(&z->zone_lock);
2937 
2938         if (wakeup) {
2939                 /*
2940                  * Grabbing zonehash_lock here effectively synchronizes with
2941                  * zone_destroy() to avoid missed signals.
2942                  */
2943                 mutex_enter(&zonehash_lock);
2944                 cv_broadcast(&zone_destroy_cv);
2945                 mutex_exit(&zonehash_lock);
2946         }
2947 }
2948 
2949 void
2950 zone_task_hold(zone_t *z)
2951 {
2952         mutex_enter(&z->zone_lock);
2953         z->zone_ntasks++;
2954         ASSERT(z->zone_ntasks != 0);
2955         mutex_exit(&z->zone_lock);
2956 }
2957 
2958 void
2959 zone_task_rele(zone_t *zone)
2960 {
2961         uint_t refcnt;
2962 
2963         mutex_enter(&zone->zone_lock);
2964         ASSERT(zone->zone_ntasks != 0);
2965         refcnt = --zone->zone_ntasks;
2966         if (refcnt > 1)      {       /* Common case */
2967                 mutex_exit(&zone->zone_lock);
2968                 return;
2969         }
2970         zone_hold_locked(zone); /* so we can use the zone_t later */
2971         mutex_exit(&zone->zone_lock);
2972         if (refcnt == 1) {
2973                 /*
2974                  * See if the zone is shutting down.
2975                  */
2976                 mutex_enter(&zone_status_lock);
2977                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2978                         goto out;
2979                 }
2980 
2981                 /*
2982                  * Make sure the ntasks didn't change since we
2983                  * dropped zone_lock.
2984                  */
2985                 mutex_enter(&zone->zone_lock);
2986                 if (refcnt != zone->zone_ntasks) {
2987                         mutex_exit(&zone->zone_lock);
2988                         goto out;
2989                 }
2990                 mutex_exit(&zone->zone_lock);
2991 
2992                 /*
2993                  * No more user processes in the zone.  The zone is empty.
2994                  */
2995                 zone_status_set(zone, ZONE_IS_EMPTY);
2996                 goto out;
2997         }
2998 
2999         ASSERT(refcnt == 0);
3000         /*
3001          * zsched has exited; the zone is dead.
3002          */
3003         zone->zone_zsched = NULL;            /* paranoia */
3004         mutex_enter(&zone_status_lock);
3005         zone_status_set(zone, ZONE_IS_DEAD);
3006 out:
3007         mutex_exit(&zone_status_lock);
3008         zone_rele(zone);
3009 }
3010 
3011 zoneid_t
3012 getzoneid(void)
3013 {
3014         return (curproc->p_zone->zone_id);
3015 }
3016 
3017 /*
3018  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3019  * check the validity of a zone's state.
3020  */
3021 static zone_t *
3022 zone_find_all_by_id(zoneid_t zoneid)
3023 {
3024         mod_hash_val_t hv;
3025         zone_t *zone = NULL;
3026 
3027         ASSERT(MUTEX_HELD(&zonehash_lock));
3028 
3029         if (mod_hash_find(zonehashbyid,
3030             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3031                 zone = (zone_t *)hv;
3032         return (zone);
3033 }
3034 
3035 static zone_t *
3036 zone_find_all_by_label(const ts_label_t *label)
3037 {
3038         mod_hash_val_t hv;
3039         zone_t *zone = NULL;
3040 
3041         ASSERT(MUTEX_HELD(&zonehash_lock));
3042 
3043         /*
3044          * zonehashbylabel is not maintained for unlabeled systems
3045          */
3046         if (!is_system_labeled())
3047                 return (NULL);
3048         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3049                 zone = (zone_t *)hv;
3050         return (zone);
3051 }
3052 
3053 static zone_t *
3054 zone_find_all_by_name(char *name)
3055 {
3056         mod_hash_val_t hv;
3057         zone_t *zone = NULL;
3058 
3059         ASSERT(MUTEX_HELD(&zonehash_lock));
3060 
3061         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3062                 zone = (zone_t *)hv;
3063         return (zone);
3064 }
3065 
3066 /*
3067  * Public interface for looking up a zone by zoneid.  Only returns the zone if
3068  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3069  * Caller must call zone_rele() once it is done with the zone.
3070  *
3071  * The zone may begin the zone_destroy() sequence immediately after this
3072  * function returns, but may be safely used until zone_rele() is called.
3073  */
3074 zone_t *
3075 zone_find_by_id(zoneid_t zoneid)
3076 {
3077         zone_t *zone;
3078         zone_status_t status;
3079 
3080         mutex_enter(&zonehash_lock);
3081         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3082                 mutex_exit(&zonehash_lock);
3083                 return (NULL);
3084         }
3085         status = zone_status_get(zone);
3086         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3087                 /*
3088                  * For all practical purposes the zone doesn't exist.
3089                  */
3090                 mutex_exit(&zonehash_lock);
3091                 return (NULL);
3092         }
3093         zone_hold(zone);
3094         mutex_exit(&zonehash_lock);
3095         return (zone);
3096 }
3097 
3098 /*
3099  * Similar to zone_find_by_id, but using zone label as the key.
3100  */
3101 zone_t *
3102 zone_find_by_label(const ts_label_t *label)
3103 {
3104         zone_t *zone;
3105         zone_status_t status;
3106 
3107         mutex_enter(&zonehash_lock);
3108         if ((zone = zone_find_all_by_label(label)) == NULL) {
3109                 mutex_exit(&zonehash_lock);
3110                 return (NULL);
3111         }
3112 
3113         status = zone_status_get(zone);
3114         if (status > ZONE_IS_DOWN) {
3115                 /*
3116                  * For all practical purposes the zone doesn't exist.
3117                  */
3118                 mutex_exit(&zonehash_lock);
3119                 return (NULL);
3120         }
3121         zone_hold(zone);
3122         mutex_exit(&zonehash_lock);
3123         return (zone);
3124 }
3125 
3126 /*
3127  * Similar to zone_find_by_id, but using zone name as the key.
3128  */
3129 zone_t *
3130 zone_find_by_name(char *name)
3131 {
3132         zone_t *zone;
3133         zone_status_t status;
3134 
3135         mutex_enter(&zonehash_lock);
3136         if ((zone = zone_find_all_by_name(name)) == NULL) {
3137                 mutex_exit(&zonehash_lock);
3138                 return (NULL);
3139         }
3140         status = zone_status_get(zone);
3141         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3142                 /*
3143                  * For all practical purposes the zone doesn't exist.
3144                  */
3145                 mutex_exit(&zonehash_lock);
3146                 return (NULL);
3147         }
3148         zone_hold(zone);
3149         mutex_exit(&zonehash_lock);
3150         return (zone);
3151 }
3152 
3153 /*
3154  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3155  * if there is a zone "foo" rooted at /foo/root, and the path argument
3156  * is "/foo/root/proc", it will return the held zone_t corresponding to
3157  * zone "foo".
3158  *
3159  * zone_find_by_path() always returns a non-NULL value, since at the
3160  * very least every path will be contained in the global zone.
3161  *
3162  * As with the other zone_find_by_*() functions, the caller is
3163  * responsible for zone_rele()ing the return value of this function.
3164  */
3165 zone_t *
3166 zone_find_by_path(const char *path)
3167 {
3168         zone_t *zone;
3169         zone_t *zret = NULL;
3170         zone_status_t status;
3171 
3172         if (path == NULL) {
3173                 /*
3174                  * Call from rootconf().
3175                  */
3176                 zone_hold(global_zone);
3177                 return (global_zone);
3178         }
3179         ASSERT(*path == '/');
3180         mutex_enter(&zonehash_lock);
3181         for (zone = list_head(&zone_active); zone != NULL;
3182             zone = list_next(&zone_active, zone)) {
3183                 if (ZONE_PATH_VISIBLE(path, zone))
3184                         zret = zone;
3185         }
3186         ASSERT(zret != NULL);
3187         status = zone_status_get(zret);
3188         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3189                 /*
3190                  * Zone practically doesn't exist.
3191                  */
3192                 zret = global_zone;
3193         }
3194         zone_hold(zret);
3195         mutex_exit(&zonehash_lock);
3196         return (zret);
3197 }
3198 
3199 /*
3200  * Public interface for updating per-zone load averages.  Called once per
3201  * second.
3202  *
3203  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3204  */
3205 void
3206 zone_loadavg_update()
3207 {
3208         zone_t *zp;
3209         zone_status_t status;
3210         struct loadavg_s *lavg;
3211         hrtime_t zone_total;
3212         int i;
3213         hrtime_t hr_avg;
3214         int nrun;
3215         static int64_t f[3] = { 135, 27, 9 };
3216         int64_t q, r;
3217 
3218         mutex_enter(&zonehash_lock);
3219         for (zp = list_head(&zone_active); zp != NULL;
3220             zp = list_next(&zone_active, zp)) {
3221                 mutex_enter(&zp->zone_lock);
3222 
3223                 /* Skip zones that are on the way down or not yet up */
3224                 status = zone_status_get(zp);
3225                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3226                         /* For all practical purposes the zone doesn't exist. */
3227                         mutex_exit(&zp->zone_lock);
3228                         continue;
3229                 }
3230 
3231                 /*
3232                  * Update the 10 second moving average data in zone_loadavg.
3233                  */
3234                 lavg = &zp->zone_loadavg;
3235 
3236                 zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3237                 scalehrtime(&zone_total);
3238 
3239                 /* The zone_total should always be increasing. */
3240                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3241                     zone_total - lavg->lg_total : 0;
3242                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3243                 /* lg_total holds the prev. 1 sec. total */
3244                 lavg->lg_total = zone_total;
3245 
3246                 /*
3247                  * To simplify the calculation, we don't calculate the load avg.
3248                  * until the zone has been up for at least 10 seconds and our
3249                  * moving average is thus full.
3250                  */
3251                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3252                         lavg->lg_len++;
3253                         mutex_exit(&zp->zone_lock);
3254                         continue;
3255                 }
3256 
3257                 /* Now calculate the 1min, 5min, 15 min load avg. */
3258                 hr_avg = 0;
3259                 for (i = 0; i < S_LOADAVG_SZ; i++)
3260                         hr_avg += lavg->lg_loads[i];
3261                 hr_avg = hr_avg / S_LOADAVG_SZ;
3262                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3263 
3264                 /* Compute load avg. See comment in calcloadavg() */
3265                 for (i = 0; i < 3; i++) {
3266                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3267                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3268                         zp->zone_hp_avenrun[i] +=
3269                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3270 
3271                         /* avenrun[] can only hold 31 bits of load avg. */
3272                         if (zp->zone_hp_avenrun[i] <
3273                             ((uint64_t)1<<(31+16-FSHIFT)))
3274                                 zp->zone_avenrun[i] = (int32_t)
3275                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3276                         else
3277                                 zp->zone_avenrun[i] = 0x7fffffff;
3278                 }
3279 
3280                 mutex_exit(&zp->zone_lock);
3281         }
3282         mutex_exit(&zonehash_lock);
3283 }
3284 
3285 /*
3286  * Get the number of cpus visible to this zone.  The system-wide global
3287  * 'ncpus' is returned if pools are disabled, the caller is in the
3288  * global zone, or a NULL zone argument is passed in.
3289  */
3290 int
3291 zone_ncpus_get(zone_t *zone)
3292 {
3293         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3294 
3295         return (myncpus != 0 ? myncpus : ncpus);
3296 }
3297 
3298 /*
3299  * Get the number of online cpus visible to this zone.  The system-wide
3300  * global 'ncpus_online' is returned if pools are disabled, the caller
3301  * is in the global zone, or a NULL zone argument is passed in.
3302  */
3303 int
3304 zone_ncpus_online_get(zone_t *zone)
3305 {
3306         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3307 
3308         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3309 }
3310 
3311 /*
3312  * Return the pool to which the zone is currently bound.
3313  */
3314 pool_t *
3315 zone_pool_get(zone_t *zone)
3316 {
3317         ASSERT(pool_lock_held());
3318 
3319         return (zone->zone_pool);
3320 }
3321 
3322 /*
3323  * Set the zone's pool pointer and update the zone's visibility to match
3324  * the resources in the new pool.
3325  */
3326 void
3327 zone_pool_set(zone_t *zone, pool_t *pool)
3328 {
3329         ASSERT(pool_lock_held());
3330         ASSERT(MUTEX_HELD(&cpu_lock));
3331 
3332         zone->zone_pool = pool;
3333         zone_pset_set(zone, pool->pool_pset->pset_id);
3334 }
3335 
3336 /*
3337  * Return the cached value of the id of the processor set to which the
3338  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3339  * facility is disabled.
3340  */
3341 psetid_t
3342 zone_pset_get(zone_t *zone)
3343 {
3344         ASSERT(MUTEX_HELD(&cpu_lock));
3345 
3346         return (zone->zone_psetid);
3347 }
3348 
3349 /*
3350  * Set the cached value of the id of the processor set to which the zone
3351  * is currently bound.  Also update the zone's visibility to match the
3352  * resources in the new processor set.
3353  */
3354 void
3355 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3356 {
3357         psetid_t oldpsetid;
3358 
3359         ASSERT(MUTEX_HELD(&cpu_lock));
3360         oldpsetid = zone_pset_get(zone);
3361 
3362         if (oldpsetid == newpsetid)
3363                 return;
3364         /*
3365          * Global zone sees all.
3366          */
3367         if (zone != global_zone) {
3368                 zone->zone_psetid = newpsetid;
3369                 if (newpsetid != ZONE_PS_INVAL)
3370                         pool_pset_visibility_add(newpsetid, zone);
3371                 if (oldpsetid != ZONE_PS_INVAL)
3372                         pool_pset_visibility_remove(oldpsetid, zone);
3373         }
3374         /*
3375          * Disabling pools, so we should start using the global values
3376          * for ncpus and ncpus_online.
3377          */
3378         if (newpsetid == ZONE_PS_INVAL) {
3379                 zone->zone_ncpus = 0;
3380                 zone->zone_ncpus_online = 0;
3381         }
3382 }
3383 
3384 /*
3385  * Walk the list of active zones and issue the provided callback for
3386  * each of them.
3387  *
3388  * Caller must not be holding any locks that may be acquired under
3389  * zonehash_lock.  See comment at the beginning of the file for a list of
3390  * common locks and their interactions with zones.
3391  */
3392 int
3393 zone_walk(int (*cb)(zone_t *, void *), void *data)
3394 {
3395         zone_t *zone;
3396         int ret = 0;
3397         zone_status_t status;
3398 
3399         mutex_enter(&zonehash_lock);
3400         for (zone = list_head(&zone_active); zone != NULL;
3401             zone = list_next(&zone_active, zone)) {
3402                 /*
3403                  * Skip zones that shouldn't be externally visible.
3404                  */
3405                 status = zone_status_get(zone);
3406                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3407                         continue;
3408                 /*
3409                  * Bail immediately if any callback invocation returns a
3410                  * non-zero value.
3411                  */
3412                 ret = (*cb)(zone, data);
3413                 if (ret != 0)
3414                         break;
3415         }
3416         mutex_exit(&zonehash_lock);
3417         return (ret);
3418 }
3419 
3420 static int
3421 zone_set_root(zone_t *zone, const char *upath)
3422 {
3423         vnode_t *vp;
3424         int trycount;
3425         int error = 0;
3426         char *path;
3427         struct pathname upn, pn;
3428         size_t pathlen;
3429 
3430         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3431                 return (error);
3432 
3433         pn_alloc(&pn);
3434 
3435         /* prevent infinite loop */
3436         trycount = 10;
3437         for (;;) {
3438                 if (--trycount <= 0) {
3439                         error = ESTALE;
3440                         goto out;
3441                 }
3442 
3443                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3444                         /*
3445                          * VOP_ACCESS() may cover 'vp' with a new
3446                          * filesystem, if 'vp' is an autoFS vnode.
3447                          * Get the new 'vp' if so.
3448                          */
3449                         if ((error =
3450                             VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3451                             (!vn_ismntpt(vp) ||
3452                             (error = traverse(&vp)) == 0)) {
3453                                 pathlen = pn.pn_pathlen + 2;
3454                                 path = kmem_alloc(pathlen, KM_SLEEP);
3455                                 (void) strncpy(path, pn.pn_path,
3456                                     pn.pn_pathlen + 1);
3457                                 path[pathlen - 2] = '/';
3458                                 path[pathlen - 1] = '\0';
3459                                 pn_free(&pn);
3460                                 pn_free(&upn);
3461 
3462                                 /* Success! */
3463                                 break;
3464                         }
3465                         VN_RELE(vp);
3466                 }
3467                 if (error != ESTALE)
3468                         goto out;
3469         }
3470 
3471         ASSERT(error == 0);
3472         zone->zone_rootvp = vp;              /* we hold a reference to vp */
3473         zone->zone_rootpath = path;
3474         zone->zone_rootpathlen = pathlen;
3475         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3476                 zone->zone_flags |= ZF_IS_SCRATCH;
3477         return (0);
3478 
3479 out:
3480         pn_free(&pn);
3481         pn_free(&upn);
3482         return (error);
3483 }
3484 
3485 #define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
3486                         ((c) >= 'a' && (c) <= 'z') || \
3487                         ((c) >= 'A' && (c) <= 'Z'))
3488 
3489 static int
3490 zone_set_name(zone_t *zone, const char *uname)
3491 {
3492         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3493         size_t len;
3494         int i, err;
3495 
3496         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3497                 kmem_free(kname, ZONENAME_MAX);
3498                 return (err);   /* EFAULT or ENAMETOOLONG */
3499         }
3500 
3501         /* must be less than ZONENAME_MAX */
3502         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3503                 kmem_free(kname, ZONENAME_MAX);
3504                 return (EINVAL);
3505         }
3506 
3507         /*
3508          * Name must start with an alphanumeric and must contain only
3509          * alphanumerics, '-', '_' and '.'.
3510          */
3511         if (!isalnum(kname[0])) {
3512                 kmem_free(kname, ZONENAME_MAX);
3513                 return (EINVAL);
3514         }
3515         for (i = 1; i < len - 1; i++) {
3516                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3517                     kname[i] != '.') {
3518                         kmem_free(kname, ZONENAME_MAX);
3519                         return (EINVAL);
3520                 }
3521         }
3522 
3523         zone->zone_name = kname;
3524         return (0);
3525 }
3526 
3527 /*
3528  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3529  * is NULL or it points to a zone with no hostid emulation, then the machine's
3530  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3531  * zero if neither the zone nor the host machine (global zone) have hostids.  It
3532  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3533  * hostid and the machine's hostid is invalid.
3534  */
3535 uint32_t
3536 zone_get_hostid(zone_t *zonep)
3537 {
3538         unsigned long machine_hostid;
3539 
3540         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3541                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3542                         return (HW_INVALID_HOSTID);
3543                 return ((uint32_t)machine_hostid);
3544         }
3545         return (zonep->zone_hostid);
3546 }
3547 
3548 /*
3549  * Similar to thread_create(), but makes sure the thread is in the appropriate
3550  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3551  */
3552 /*ARGSUSED*/
3553 kthread_t *
3554 zthread_create(
3555     caddr_t stk,
3556     size_t stksize,
3557     void (*proc)(),
3558     void *arg,
3559     size_t len,
3560     pri_t pri)
3561 {
3562         kthread_t *t;
3563         zone_t *zone = curproc->p_zone;
3564         proc_t *pp = zone->zone_zsched;
3565 
3566         zone_hold(zone);        /* Reference to be dropped when thread exits */
3567 
3568         /*
3569          * No-one should be trying to create threads if the zone is shutting
3570          * down and there aren't any kernel threads around.  See comment
3571          * in zthread_exit().
3572          */
3573         ASSERT(!(zone->zone_kthreads == NULL &&
3574             zone_status_get(zone) >= ZONE_IS_EMPTY));
3575         /*
3576          * Create a thread, but don't let it run until we've finished setting
3577          * things up.
3578          */
3579         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3580         ASSERT(t->t_forw == NULL);
3581         mutex_enter(&zone_status_lock);
3582         if (zone->zone_kthreads == NULL) {
3583                 t->t_forw = t->t_back = t;
3584         } else {
3585                 kthread_t *tx = zone->zone_kthreads;
3586 
3587                 t->t_forw = tx;
3588                 t->t_back = tx->t_back;
3589                 tx->t_back->t_forw = t;
3590                 tx->t_back = t;
3591         }
3592         zone->zone_kthreads = t;
3593         mutex_exit(&zone_status_lock);
3594 
3595         mutex_enter(&pp->p_lock);
3596         t->t_proc_flag |= TP_ZTHREAD;
3597         project_rele(t->t_proj);
3598         t->t_proj = project_hold(pp->p_task->tk_proj);
3599 
3600         /*
3601          * Setup complete, let it run.
3602          */
3603         thread_lock(t);
3604         t->t_schedflag |= TS_ALLSTART;
3605         setrun_locked(t);
3606         thread_unlock(t);
3607 
3608         mutex_exit(&pp->p_lock);
3609 
3610         return (t);
3611 }
3612 
3613 /*
3614  * Similar to thread_exit().  Must be called by threads created via
3615  * zthread_exit().
3616  */
3617 void
3618 zthread_exit(void)
3619 {
3620         kthread_t *t = curthread;
3621         proc_t *pp = curproc;
3622         zone_t *zone = pp->p_zone;
3623 
3624         mutex_enter(&zone_status_lock);
3625 
3626         /*
3627          * Reparent to p0
3628          */
3629         kpreempt_disable();
3630         mutex_enter(&pp->p_lock);
3631         t->t_proc_flag &= ~TP_ZTHREAD;
3632         t->t_procp = &p0;
3633         hat_thread_exit(t);
3634         mutex_exit(&pp->p_lock);
3635         kpreempt_enable();
3636 
3637         if (t->t_back == t) {
3638                 ASSERT(t->t_forw == t);
3639                 /*
3640                  * If the zone is empty, once the thread count
3641                  * goes to zero no further kernel threads can be
3642                  * created.  This is because if the creator is a process
3643                  * in the zone, then it must have exited before the zone
3644                  * state could be set to ZONE_IS_EMPTY.
3645                  * Otherwise, if the creator is a kernel thread in the
3646                  * zone, the thread count is non-zero.
3647                  *
3648                  * This really means that non-zone kernel threads should
3649                  * not create zone kernel threads.
3650                  */
3651                 zone->zone_kthreads = NULL;
3652                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3653                         zone_status_set(zone, ZONE_IS_DOWN);
3654                         /*
3655                          * Remove any CPU caps on this zone.
3656                          */
3657                         cpucaps_zone_remove(zone);
3658                 }
3659         } else {
3660                 t->t_forw->t_back = t->t_back;
3661                 t->t_back->t_forw = t->t_forw;
3662                 if (zone->zone_kthreads == t)
3663                         zone->zone_kthreads = t->t_forw;
3664         }
3665         mutex_exit(&zone_status_lock);
3666         zone_rele(zone);
3667         thread_exit();
3668         /* NOTREACHED */
3669 }
3670 
3671 static void
3672 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3673 {
3674         vnode_t *oldvp;
3675 
3676         /* we're going to hold a reference here to the directory */
3677         VN_HOLD(vp);
3678 
3679         /* update abs cwd/root path see c2/audit.c */
3680         if (AU_AUDITING())
3681                 audit_chdirec(vp, vpp);
3682 
3683         mutex_enter(&pp->p_lock);
3684         oldvp = *vpp;
3685         *vpp = vp;
3686         mutex_exit(&pp->p_lock);
3687         if (oldvp != NULL)
3688                 VN_RELE(oldvp);
3689 }
3690 
3691 /*
3692  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3693  */
3694 static int
3695 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3696 {
3697         nvpair_t *nvp = NULL;
3698         boolean_t priv_set = B_FALSE;
3699         boolean_t limit_set = B_FALSE;
3700         boolean_t action_set = B_FALSE;
3701 
3702         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3703                 const char *name;
3704                 uint64_t ui64;
3705 
3706                 name = nvpair_name(nvp);
3707                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3708                         return (EINVAL);
3709                 (void) nvpair_value_uint64(nvp, &ui64);
3710                 if (strcmp(name, "privilege") == 0) {
3711                         /*
3712                          * Currently only privileged values are allowed, but
3713                          * this may change in the future.
3714                          */
3715                         if (ui64 != RCPRIV_PRIVILEGED)
3716                                 return (EINVAL);
3717                         rv->rcv_privilege = ui64;
3718                         priv_set = B_TRUE;
3719                 } else if (strcmp(name, "limit") == 0) {
3720                         rv->rcv_value = ui64;
3721                         limit_set = B_TRUE;
3722                 } else if (strcmp(name, "action") == 0) {
3723                         if (ui64 != RCTL_LOCAL_NOACTION &&
3724                             ui64 != RCTL_LOCAL_DENY)
3725                                 return (EINVAL);
3726                         rv->rcv_flagaction = ui64;
3727                         action_set = B_TRUE;
3728                 } else {
3729                         return (EINVAL);
3730                 }
3731         }
3732 
3733         if (!(priv_set && limit_set && action_set))
3734                 return (EINVAL);
3735         rv->rcv_action_signal = 0;
3736         rv->rcv_action_recipient = NULL;
3737         rv->rcv_action_recip_pid = -1;
3738         rv->rcv_firing_time = 0;
3739 
3740         return (0);
3741 }
3742 
3743 /*
3744  * Non-global zone version of start_init.
3745  */
3746 void
3747 zone_start_init(void)
3748 {
3749         proc_t *p = ttoproc(curthread);
3750         zone_t *z = p->p_zone;
3751 
3752         ASSERT(!INGLOBALZONE(curproc));
3753 
3754         /*
3755          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3756          * storing just the pid of init is sufficient.
3757          */
3758         z->zone_proc_initpid = p->p_pid;
3759 
3760         /*
3761          * We maintain zone_boot_err so that we can return the cause of the
3762          * failure back to the caller of the zone_boot syscall.
3763          */
3764         p->p_zone->zone_boot_err = start_init_common();
3765 
3766         /*
3767          * We will prevent booting zones from becoming running zones if the
3768          * global zone is shutting down.
3769          */
3770         mutex_enter(&zone_status_lock);
3771         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3772             ZONE_IS_SHUTTING_DOWN) {
3773                 /*
3774                  * Make sure we are still in the booting state-- we could have
3775                  * raced and already be shutting down, or even further along.
3776                  */
3777                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
3778                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3779                 }
3780                 mutex_exit(&zone_status_lock);
3781                 /* It's gone bad, dispose of the process */
3782                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3783                         mutex_enter(&p->p_lock);
3784                         ASSERT(p->p_flag & SEXITLWPS);
3785                         lwp_exit();
3786                 }
3787         } else {
3788                 if (zone_status_get(z) == ZONE_IS_BOOTING)
3789                         zone_status_set(z, ZONE_IS_RUNNING);
3790                 mutex_exit(&zone_status_lock);
3791                 /* cause the process to return to userland. */
3792                 lwp_rtt();
3793         }
3794 }
3795 
3796 struct zsched_arg {
3797         zone_t *zone;
3798         nvlist_t *nvlist;
3799 };
3800 
3801 /*
3802  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3803  * anything to do with scheduling, but rather with the fact that
3804  * per-zone kernel threads are parented to zsched, just like regular
3805  * kernel threads are parented to sched (p0).
3806  *
3807  * zsched is also responsible for launching init for the zone.
3808  */
3809 static void
3810 zsched(void *arg)
3811 {
3812         struct zsched_arg *za = arg;
3813         proc_t *pp = curproc;
3814         proc_t *initp = proc_init;
3815         zone_t *zone = za->zone;
3816         cred_t *cr, *oldcred;
3817         rctl_set_t *set;
3818         rctl_alloc_gp_t *gp;
3819         contract_t *ct = NULL;
3820         task_t *tk, *oldtk;
3821         rctl_entity_p_t e;
3822         kproject_t *pj;
3823 
3824         nvlist_t *nvl = za->nvlist;
3825         nvpair_t *nvp = NULL;
3826 
3827         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3828         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3829         PTOU(pp)->u_argc = 0;
3830         PTOU(pp)->u_argv = NULL;
3831         PTOU(pp)->u_envp = NULL;
3832         PTOU(pp)->u_commpagep = NULL;
3833         closeall(P_FINFO(pp));
3834 
3835         /*
3836          * We are this zone's "zsched" process.  As the zone isn't generally
3837          * visible yet we don't need to grab any locks before initializing its
3838          * zone_proc pointer.
3839          */
3840         zone_hold(zone);  /* this hold is released by zone_destroy() */
3841         zone->zone_zsched = pp;
3842         mutex_enter(&pp->p_lock);
3843         pp->p_zone = zone;
3844         mutex_exit(&pp->p_lock);
3845 
3846         /*
3847          * Disassociate process from its 'parent'; parent ourselves to init
3848          * (pid 1) and change other values as needed.
3849          */
3850         sess_create();
3851 
3852         mutex_enter(&pidlock);
3853         proc_detach(pp);
3854         pp->p_ppid = 1;
3855         pp->p_flag |= SZONETOP;
3856         pp->p_ancpid = 1;
3857         pp->p_parent = initp;
3858         pp->p_psibling = NULL;
3859         if (initp->p_child)
3860                 initp->p_child->p_psibling = pp;
3861         pp->p_sibling = initp->p_child;
3862         initp->p_child = pp;
3863 
3864         /* Decrement what newproc() incremented. */
3865         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3866         /*
3867          * Our credentials are about to become kcred-like, so we don't care
3868          * about the caller's ruid.
3869          */
3870         upcount_inc(crgetruid(kcred), zone->zone_id);
3871         mutex_exit(&pidlock);
3872 
3873         /*
3874          * getting out of global zone, so decrement lwp and process counts
3875          */
3876         pj = pp->p_task->tk_proj;
3877         mutex_enter(&global_zone->zone_nlwps_lock);
3878         pj->kpj_nlwps -= pp->p_lwpcnt;
3879         global_zone->zone_nlwps -= pp->p_lwpcnt;
3880         pj->kpj_nprocs--;
3881         global_zone->zone_nprocs--;
3882         mutex_exit(&global_zone->zone_nlwps_lock);
3883 
3884         /*
3885          * Decrement locked memory counts on old zone and project.
3886          */
3887         mutex_enter(&global_zone->zone_mem_lock);
3888         global_zone->zone_locked_mem -= pp->p_locked_mem;
3889         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3890         mutex_exit(&global_zone->zone_mem_lock);
3891 
3892         /*
3893          * Create and join a new task in project '0' of this zone.
3894          *
3895          * We don't need to call holdlwps() since we know we're the only lwp in
3896          * this process.
3897          *
3898          * task_join() returns with p_lock held.
3899          */
3900         tk = task_create(0, zone);
3901         mutex_enter(&cpu_lock);
3902         oldtk = task_join(tk, 0);
3903 
3904         pj = pp->p_task->tk_proj;
3905 
3906         mutex_enter(&zone->zone_mem_lock);
3907         zone->zone_locked_mem += pp->p_locked_mem;
3908         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3909         mutex_exit(&zone->zone_mem_lock);
3910 
3911         /*
3912          * add lwp and process counts to zsched's zone, and increment
3913          * project's task and process count due to the task created in
3914          * the above task_create.
3915          */
3916         mutex_enter(&zone->zone_nlwps_lock);
3917         pj->kpj_nlwps += pp->p_lwpcnt;
3918         pj->kpj_ntasks += 1;
3919         zone->zone_nlwps += pp->p_lwpcnt;
3920         pj->kpj_nprocs++;
3921         zone->zone_nprocs++;
3922         mutex_exit(&zone->zone_nlwps_lock);
3923 
3924         mutex_exit(&curproc->p_lock);
3925         mutex_exit(&cpu_lock);
3926         task_rele(oldtk);
3927 
3928         /*
3929          * The process was created by a process in the global zone, hence the
3930          * credentials are wrong.  We might as well have kcred-ish credentials.
3931          */
3932         cr = zone->zone_kcred;
3933         crhold(cr);
3934         mutex_enter(&pp->p_crlock);
3935         oldcred = pp->p_cred;
3936         pp->p_cred = cr;
3937         mutex_exit(&pp->p_crlock);
3938         crfree(oldcred);
3939 
3940         /*
3941          * Hold credentials again (for thread)
3942          */
3943         crhold(cr);
3944 
3945         /*
3946          * p_lwpcnt can't change since this is a kernel process.
3947          */
3948         crset(pp, cr);
3949 
3950         /*
3951          * Chroot
3952          */
3953         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3954         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3955 
3956         /*
3957          * Initialize zone's rctl set.
3958          */
3959         set = rctl_set_create();
3960         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3961         mutex_enter(&pp->p_lock);
3962         e.rcep_p.zone = zone;
3963         e.rcep_t = RCENTITY_ZONE;
3964         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3965         mutex_exit(&pp->p_lock);
3966         rctl_prealloc_destroy(gp);
3967 
3968         /*
3969          * Apply the rctls passed in to zone_create().  This is basically a list
3970          * assignment: all of the old values are removed and the new ones
3971          * inserted.  That is, if an empty list is passed in, all values are
3972          * removed.
3973          */
3974         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3975                 rctl_dict_entry_t *rde;
3976                 rctl_hndl_t hndl;
3977                 char *name;
3978                 nvlist_t **nvlarray;
3979                 uint_t i, nelem;
3980                 int error;      /* For ASSERT()s */
3981 
3982                 name = nvpair_name(nvp);
3983                 hndl = rctl_hndl_lookup(name);
3984                 ASSERT(hndl != -1);
3985                 rde = rctl_dict_lookup_hndl(hndl);
3986                 ASSERT(rde != NULL);
3987 
3988                 for (; /* ever */; ) {
3989                         rctl_val_t oval;
3990 
3991                         mutex_enter(&pp->p_lock);
3992                         error = rctl_local_get(hndl, NULL, &oval, pp);
3993                         mutex_exit(&pp->p_lock);
3994                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3995                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3996                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
3997                                 break;
3998                         mutex_enter(&pp->p_lock);
3999                         error = rctl_local_delete(hndl, &oval, pp);
4000                         mutex_exit(&pp->p_lock);
4001                         ASSERT(error == 0);
4002                 }
4003                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4004                 ASSERT(error == 0);
4005                 for (i = 0; i < nelem; i++) {
4006                         rctl_val_t *nvalp;
4007 
4008                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4009                         error = nvlist2rctlval(nvlarray[i], nvalp);
4010                         ASSERT(error == 0);
4011                         /*
4012                          * rctl_local_insert can fail if the value being
4013                          * inserted is a duplicate; this is OK.
4014                          */
4015                         mutex_enter(&pp->p_lock);
4016                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
4017                                 kmem_cache_free(rctl_val_cache, nvalp);
4018                         mutex_exit(&pp->p_lock);
4019                 }
4020         }
4021 
4022         /*
4023          * Tell the world that we're done setting up.
4024          *
4025          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4026          * and atomically set the zone's processor set visibility.  Once
4027          * we drop pool_lock() this zone will automatically get updated
4028          * to reflect any future changes to the pools configuration.
4029          *
4030          * Note that after we drop the locks below (zonehash_lock in
4031          * particular) other operations such as a zone_getattr call can
4032          * now proceed and observe the zone. That is the reason for doing a
4033          * state transition to the INITIALIZED state.
4034          */
4035         pool_lock();
4036         mutex_enter(&cpu_lock);
4037         mutex_enter(&zonehash_lock);
4038         zone_uniqid(zone);
4039         zone_zsd_configure(zone);
4040         if (pool_state == POOL_ENABLED)
4041                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
4042         mutex_enter(&zone_status_lock);
4043         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4044         zone_status_set(zone, ZONE_IS_INITIALIZED);
4045         mutex_exit(&zone_status_lock);
4046         mutex_exit(&zonehash_lock);
4047         mutex_exit(&cpu_lock);
4048         pool_unlock();
4049 
4050         /* Now call the create callback for this key */
4051         zsd_apply_all_keys(zsd_apply_create, zone);
4052 
4053         /* The callbacks are complete. Mark ZONE_IS_READY */
4054         mutex_enter(&zone_status_lock);
4055         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4056         zone_status_set(zone, ZONE_IS_READY);
4057         mutex_exit(&zone_status_lock);
4058 
4059         /*
4060          * Once we see the zone transition to the ZONE_IS_BOOTING state,
4061          * we launch init, and set the state to running.
4062          */
4063         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4064 
4065         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4066                 id_t cid;
4067 
4068                 /*
4069                  * Ok, this is a little complicated.  We need to grab the
4070                  * zone's pool's scheduling class ID; note that by now, we
4071                  * are already bound to a pool if we need to be (zoneadmd
4072                  * will have done that to us while we're in the READY
4073                  * state).  *But* the scheduling class for the zone's 'init'
4074                  * must be explicitly passed to newproc, which doesn't
4075                  * respect pool bindings.
4076                  *
4077                  * We hold the pool_lock across the call to newproc() to
4078                  * close the obvious race: the pool's scheduling class
4079                  * could change before we manage to create the LWP with
4080                  * classid 'cid'.
4081                  */
4082                 pool_lock();
4083                 if (zone->zone_defaultcid > 0)
4084                         cid = zone->zone_defaultcid;
4085                 else
4086                         cid = pool_get_class(zone->zone_pool);
4087                 if (cid == -1)
4088                         cid = defaultcid;
4089 
4090                 /*
4091                  * If this fails, zone_boot will ultimately fail.  The
4092                  * state of the zone will be set to SHUTTING_DOWN-- userland
4093                  * will have to tear down the zone, and fail, or try again.
4094                  */
4095                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4096                     minclsyspri - 1, &ct, 0)) != 0) {
4097                         mutex_enter(&zone_status_lock);
4098                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4099                         mutex_exit(&zone_status_lock);
4100                 } else {
4101                         zone->zone_boot_time = gethrestime_sec();
4102                 }
4103 
4104                 pool_unlock();
4105         }
4106 
4107         /*
4108          * Wait for zone_destroy() to be called.  This is what we spend
4109          * most of our life doing.
4110          */
4111         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4112 
4113         if (ct)
4114                 /*
4115                  * At this point the process contract should be empty.
4116                  * (Though if it isn't, it's not the end of the world.)
4117                  */
4118                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4119 
4120         /*
4121          * Allow kcred to be freed when all referring processes
4122          * (including this one) go away.  We can't just do this in
4123          * zone_free because we need to wait for the zone_cred_ref to
4124          * drop to 0 before calling zone_free, and the existence of
4125          * zone_kcred will prevent that.  Thus, we call crfree here to
4126          * balance the crdup in zone_create.  The crhold calls earlier
4127          * in zsched will be dropped when the thread and process exit.
4128          */
4129         crfree(zone->zone_kcred);
4130         zone->zone_kcred = NULL;
4131 
4132         exit(CLD_EXITED, 0);
4133 }
4134 
4135 /*
4136  * Helper function to determine if there are any submounts of the
4137  * provided path.  Used to make sure the zone doesn't "inherit" any
4138  * mounts from before it is created.
4139  */
4140 static uint_t
4141 zone_mount_count(const char *rootpath)
4142 {
4143         vfs_t *vfsp;
4144         uint_t count = 0;
4145         size_t rootpathlen = strlen(rootpath);
4146 
4147         /*
4148          * Holding zonehash_lock prevents race conditions with
4149          * vfs_list_add()/vfs_list_remove() since we serialize with
4150          * zone_find_by_path().
4151          */
4152         ASSERT(MUTEX_HELD(&zonehash_lock));
4153         /*
4154          * The rootpath must end with a '/'
4155          */
4156         ASSERT(rootpath[rootpathlen - 1] == '/');
4157 
4158         /*
4159          * This intentionally does not count the rootpath itself if that
4160          * happens to be a mount point.
4161          */
4162         vfs_list_read_lock();
4163         vfsp = rootvfs;
4164         do {
4165                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4166                     rootpathlen) == 0)
4167                         count++;
4168                 vfsp = vfsp->vfs_next;
4169         } while (vfsp != rootvfs);
4170         vfs_list_unlock();
4171         return (count);
4172 }
4173 
4174 /*
4175  * Helper function to make sure that a zone created on 'rootpath'
4176  * wouldn't end up containing other zones' rootpaths.
4177  */
4178 static boolean_t
4179 zone_is_nested(const char *rootpath)
4180 {
4181         zone_t *zone;
4182         size_t rootpathlen = strlen(rootpath);
4183         size_t len;
4184 
4185         ASSERT(MUTEX_HELD(&zonehash_lock));
4186 
4187         /*
4188          * zone_set_root() appended '/' and '\0' at the end of rootpath
4189          */
4190         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4191             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4192                 return (B_TRUE);
4193 
4194         for (zone = list_head(&zone_active); zone != NULL;
4195             zone = list_next(&zone_active, zone)) {
4196                 if (zone == global_zone)
4197                         continue;
4198                 len = strlen(zone->zone_rootpath);
4199                 if (strncmp(rootpath, zone->zone_rootpath,
4200                     MIN(rootpathlen, len)) == 0)
4201                         return (B_TRUE);
4202         }
4203         return (B_FALSE);
4204 }
4205 
4206 static int
4207 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4208     size_t zone_privssz)
4209 {
4210         priv_set_t *privs;
4211 
4212         if (zone_privssz < sizeof (priv_set_t))
4213                 return (ENOMEM);
4214 
4215         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4216 
4217         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4218                 kmem_free(privs, sizeof (priv_set_t));
4219                 return (EFAULT);
4220         }
4221 
4222         zone->zone_privset = privs;
4223         return (0);
4224 }
4225 
4226 /*
4227  * We make creative use of nvlists to pass in rctls from userland.  The list is
4228  * a list of the following structures:
4229  *
4230  * (name = rctl_name, value = nvpair_list_array)
4231  *
4232  * Where each element of the nvpair_list_array is of the form:
4233  *
4234  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4235  *      (name = "limit", value = uint64_t),
4236  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4237  */
4238 static int
4239 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4240 {
4241         nvpair_t *nvp = NULL;
4242         nvlist_t *nvl = NULL;
4243         char *kbuf;
4244         int error;
4245         rctl_val_t rv;
4246 
4247         *nvlp = NULL;
4248 
4249         if (buflen == 0)
4250                 return (0);
4251 
4252         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4253                 return (ENOMEM);
4254         if (copyin(ubuf, kbuf, buflen)) {
4255                 error = EFAULT;
4256                 goto out;
4257         }
4258         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4259                 /*
4260                  * nvl may have been allocated/free'd, but the value set to
4261                  * non-NULL, so we reset it here.
4262                  */
4263                 nvl = NULL;
4264                 error = EINVAL;
4265                 goto out;
4266         }
4267         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4268                 rctl_dict_entry_t *rde;
4269                 rctl_hndl_t hndl;
4270                 nvlist_t **nvlarray;
4271                 uint_t i, nelem;
4272                 char *name;
4273 
4274                 error = EINVAL;
4275                 name = nvpair_name(nvp);
4276                 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4277                     != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4278                         goto out;
4279                 }
4280                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4281                         goto out;
4282                 }
4283                 rde = rctl_dict_lookup_hndl(hndl);
4284                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4285                 ASSERT(error == 0);
4286                 for (i = 0; i < nelem; i++) {
4287                         if (error = nvlist2rctlval(nvlarray[i], &rv))
4288                                 goto out;
4289                 }
4290                 if (rctl_invalid_value(rde, &rv)) {
4291                         error = EINVAL;
4292                         goto out;
4293                 }
4294         }
4295         error = 0;
4296         *nvlp = nvl;
4297 out:
4298         kmem_free(kbuf, buflen);
4299         if (error && nvl != NULL)
4300                 nvlist_free(nvl);
4301         return (error);
4302 }
4303 
4304 int
4305 zone_create_error(int er_error, int er_ext, int *er_out)
4306 {
4307         if (er_out != NULL) {
4308                 if (copyout(&er_ext, er_out, sizeof (int))) {
4309                         return (set_errno(EFAULT));
4310                 }
4311         }
4312         return (set_errno(er_error));
4313 }
4314 
4315 static int
4316 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4317 {
4318         ts_label_t *tsl;
4319         bslabel_t blab;
4320 
4321         /* Get label from user */
4322         if (copyin(lab, &blab, sizeof (blab)) != 0)
4323                 return (EFAULT);
4324         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4325         if (tsl == NULL)
4326                 return (ENOMEM);
4327 
4328         zone->zone_slabel = tsl;
4329         return (0);
4330 }
4331 
4332 /*
4333  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4334  */
4335 static int
4336 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4337 {
4338         char *kbuf;
4339         char *dataset, *next;
4340         zone_dataset_t *zd;
4341         size_t len;
4342 
4343         if (ubuf == NULL || buflen == 0)
4344                 return (0);
4345 
4346         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4347                 return (ENOMEM);
4348 
4349         if (copyin(ubuf, kbuf, buflen) != 0) {
4350                 kmem_free(kbuf, buflen);
4351                 return (EFAULT);
4352         }
4353 
4354         dataset = next = kbuf;
4355         for (;;) {
4356                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4357 
4358                 next = strchr(dataset, ',');
4359 
4360                 if (next == NULL)
4361                         len = strlen(dataset);
4362                 else
4363                         len = next - dataset;
4364 
4365                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4366                 bcopy(dataset, zd->zd_dataset, len);
4367                 zd->zd_dataset[len] = '\0';
4368 
4369                 list_insert_head(&zone->zone_datasets, zd);
4370 
4371                 if (next == NULL)
4372                         break;
4373 
4374                 dataset = next + 1;
4375         }
4376 
4377         kmem_free(kbuf, buflen);
4378         return (0);
4379 }
4380 
4381 /*
4382  * System call to create/initialize a new zone named 'zone_name', rooted
4383  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4384  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4385  * with labeling set by 'match', 'doi', and 'label'.
4386  *
4387  * If extended error is non-null, we may use it to return more detailed
4388  * error information.
4389  */
4390 static zoneid_t
4391 zone_create(const char *zone_name, const char *zone_root,
4392     const priv_set_t *zone_privs, size_t zone_privssz,
4393     caddr_t rctlbuf, size_t rctlbufsz,
4394     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4395     int match, uint32_t doi, const bslabel_t *label,
4396     int flags)
4397 {
4398         struct zsched_arg zarg;
4399         nvlist_t *rctls = NULL;
4400         proc_t *pp = curproc;
4401         zone_t *zone, *ztmp;
4402         zoneid_t zoneid, start = GLOBAL_ZONEID;
4403         int error;
4404         int error2 = 0;
4405         char *str;
4406         cred_t *zkcr;
4407         boolean_t insert_label_hash;
4408 
4409         if (secpolicy_zone_config(CRED()) != 0)
4410                 return (set_errno(EPERM));
4411 
4412         /* can't boot zone from within chroot environment */
4413         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4414                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4415                     extended_error));
4416         /*
4417          * As the first step of zone creation, we want to allocate a zoneid.
4418          * This allocation is complicated by the fact that netstacks use the
4419          * zoneid to determine their stackid, but netstacks themselves are
4420          * freed asynchronously with respect to zone destruction.  This means
4421          * that a netstack reference leak (or in principle, an extraordinarily
4422          * long netstack reference hold) could result in a zoneid being
4423          * allocated that in fact corresponds to a stackid from an active
4424          * (referenced) netstack -- unleashing all sorts of havoc when that
4425          * netstack is actually (re)used.  (In the abstract, we might wish a
4426          * zoneid to not be deallocated until its last referencing netstack
4427          * has been released, but netstacks lack a backpointer into their
4428          * referencing zone -- and changing them to have such a pointer would
4429          * be substantial, to put it euphemistically.)  To avoid this, we
4430          * detect this condition on allocation: if we have allocated a zoneid
4431          * that corresponds to a netstack that's still in use, we warn about
4432          * it (as it is much more likely to be a reference leak than an actual
4433          * netstack reference), free it, and allocate another.  That these
4434          * identifers are allocated out of an ID space assures that we won't
4435          * see the identifier we just allocated.
4436          */
4437         for (;;) {
4438                 zoneid = id_alloc(zoneid_space);
4439 
4440                 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4441                         break;
4442 
4443                 id_free(zoneid_space, zoneid);
4444 
4445                 if (start == GLOBAL_ZONEID) {
4446                         start = zoneid;
4447                 } else if (zoneid == start) {
4448                         /*
4449                          * We have managed to iterate over the entire available
4450                          * zoneid space -- there are no identifiers available,
4451                          * presumably due to some number of leaked netstack
4452                          * references.  While it's in principle possible for us
4453                          * to continue to try, it seems wiser to give up at
4454                          * this point to warn and fail explicitly with a
4455                          * distinctive error.
4456                          */
4457                         cmn_err(CE_WARN, "zone_create() failed: all available "
4458                             "zone IDs have netstacks still in use");
4459                         return (set_errno(ENFILE));
4460                 }
4461 
4462                 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4463                     "netstack still in use", zoneid);
4464         }
4465 
4466         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4467         zone->zone_id = zoneid;
4468         zone->zone_status = ZONE_IS_UNINITIALIZED;
4469         zone->zone_pool = pool_default;
4470         zone->zone_pool_mod = gethrtime();
4471         zone->zone_psetid = ZONE_PS_INVAL;
4472         zone->zone_ncpus = 0;
4473         zone->zone_ncpus_online = 0;
4474         zone->zone_restart_init = B_TRUE;
4475         zone->zone_brand = &native_brand;
4476         zone->zone_initname = NULL;
4477         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4478         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4479         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4480         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4481         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4482             offsetof(zone_ref_t, zref_linkage));
4483         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4484             offsetof(struct zsd_entry, zsd_linkage));
4485         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4486             offsetof(zone_dataset_t, zd_linkage));
4487         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4488             offsetof(zone_dl_t, zdl_linkage));
4489         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4490         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4491 
4492         if (flags & ZCF_NET_EXCL) {
4493                 zone->zone_flags |= ZF_NET_EXCL;
4494         }
4495 
4496         if ((error = zone_set_name(zone, zone_name)) != 0) {
4497                 zone_free(zone);
4498                 return (zone_create_error(error, 0, extended_error));
4499         }
4500 
4501         if ((error = zone_set_root(zone, zone_root)) != 0) {
4502                 zone_free(zone);
4503                 return (zone_create_error(error, 0, extended_error));
4504         }
4505         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4506                 zone_free(zone);
4507                 return (zone_create_error(error, 0, extended_error));
4508         }
4509 
4510         /* initialize node name to be the same as zone name */
4511         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4512         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4513         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4514 
4515         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4516         zone->zone_domain[0] = '\0';
4517         zone->zone_hostid = HW_INVALID_HOSTID;
4518         zone->zone_shares = 1;
4519         zone->zone_shmmax = 0;
4520         zone->zone_ipc.ipcq_shmmni = 0;
4521         zone->zone_ipc.ipcq_semmni = 0;
4522         zone->zone_ipc.ipcq_msgmni = 0;
4523         zone->zone_bootargs = NULL;
4524         zone->zone_fs_allowed = NULL;
4525 
4526         secflags_zero(&zone0.zone_secflags.psf_lower);
4527         secflags_zero(&zone0.zone_secflags.psf_effective);
4528         secflags_zero(&zone0.zone_secflags.psf_inherit);
4529         secflags_fullset(&zone0.zone_secflags.psf_upper);
4530 
4531         zone->zone_initname =
4532             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4533         (void) strcpy(zone->zone_initname, zone_default_initname);
4534         zone->zone_nlwps = 0;
4535         zone->zone_nlwps_ctl = INT_MAX;
4536         zone->zone_nprocs = 0;
4537         zone->zone_nprocs_ctl = INT_MAX;
4538         zone->zone_locked_mem = 0;
4539         zone->zone_locked_mem_ctl = UINT64_MAX;
4540         zone->zone_max_swap = 0;
4541         zone->zone_max_swap_ctl = UINT64_MAX;
4542         zone->zone_max_lofi = 0;
4543         zone->zone_max_lofi_ctl = UINT64_MAX;
4544         zone0.zone_lockedmem_kstat = NULL;
4545         zone0.zone_swapresv_kstat = NULL;
4546 
4547         /*
4548          * Zsched initializes the rctls.
4549          */
4550         zone->zone_rctls = NULL;
4551 
4552         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4553                 zone_free(zone);
4554                 return (zone_create_error(error, 0, extended_error));
4555         }
4556 
4557         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4558                 zone_free(zone);
4559                 return (set_errno(error));
4560         }
4561 
4562         /*
4563          * Read in the trusted system parameters:
4564          * match flag and sensitivity label.
4565          */
4566         zone->zone_match = match;
4567         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4568                 /* Fail if requested to set doi to anything but system's doi */
4569                 if (doi != 0 && doi != default_doi) {
4570                         zone_free(zone);
4571                         return (set_errno(EINVAL));
4572                 }
4573                 /* Always apply system's doi to the zone */
4574                 error = zone_set_label(zone, label, default_doi);
4575                 if (error != 0) {
4576                         zone_free(zone);
4577                         return (set_errno(error));
4578                 }
4579                 insert_label_hash = B_TRUE;
4580         } else {
4581                 /* all zones get an admin_low label if system is not labeled */
4582                 zone->zone_slabel = l_admin_low;
4583                 label_hold(l_admin_low);
4584                 insert_label_hash = B_FALSE;
4585         }
4586 
4587         /*
4588          * Stop all lwps since that's what normally happens as part of fork().
4589          * This needs to happen before we grab any locks to avoid deadlock
4590          * (another lwp in the process could be waiting for the held lock).
4591          */
4592         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4593                 zone_free(zone);
4594                 nvlist_free(rctls);
4595                 return (zone_create_error(error, 0, extended_error));
4596         }
4597 
4598         if (block_mounts(zone) == 0) {
4599                 mutex_enter(&pp->p_lock);
4600                 if (curthread != pp->p_agenttp)
4601                         continuelwps(pp);
4602                 mutex_exit(&pp->p_lock);
4603                 zone_free(zone);
4604                 nvlist_free(rctls);
4605                 return (zone_create_error(error, 0, extended_error));
4606         }
4607 
4608         /*
4609          * Set up credential for kernel access.  After this, any errors
4610          * should go through the dance in errout rather than calling
4611          * zone_free directly.
4612          */
4613         zone->zone_kcred = crdup(kcred);
4614         crsetzone(zone->zone_kcred, zone);
4615         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4616         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4617         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4618         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4619 
4620         mutex_enter(&zonehash_lock);
4621         /*
4622          * Make sure zone doesn't already exist.
4623          *
4624          * If the system and zone are labeled,
4625          * make sure no other zone exists that has the same label.
4626          */
4627         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4628             (insert_label_hash &&
4629             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4630                 zone_status_t status;
4631 
4632                 status = zone_status_get(ztmp);
4633                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4634                         error = EEXIST;
4635                 else
4636                         error = EBUSY;
4637 
4638                 if (insert_label_hash)
4639                         error2 = ZE_LABELINUSE;
4640 
4641                 goto errout;
4642         }
4643 
4644         /*
4645          * Don't allow zone creations which would cause one zone's rootpath to
4646          * be accessible from that of another (non-global) zone.
4647          */
4648         if (zone_is_nested(zone->zone_rootpath)) {
4649                 error = EBUSY;
4650                 goto errout;
4651         }
4652 
4653         ASSERT(zonecount != 0);         /* check for leaks */
4654         if (zonecount + 1 > maxzones) {
4655                 error = ENOMEM;
4656                 goto errout;
4657         }
4658 
4659         if (zone_mount_count(zone->zone_rootpath) != 0) {
4660                 error = EBUSY;
4661                 error2 = ZE_AREMOUNTS;
4662                 goto errout;
4663         }
4664 
4665         /*
4666          * Zone is still incomplete, but we need to drop all locks while
4667          * zsched() initializes this zone's kernel process.  We
4668          * optimistically add the zone to the hashtable and associated
4669          * lists so a parallel zone_create() doesn't try to create the
4670          * same zone.
4671          */
4672         zonecount++;
4673         (void) mod_hash_insert(zonehashbyid,
4674             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4675             (mod_hash_val_t)(uintptr_t)zone);
4676         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4677         (void) strcpy(str, zone->zone_name);
4678         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4679             (mod_hash_val_t)(uintptr_t)zone);
4680         if (insert_label_hash) {
4681                 (void) mod_hash_insert(zonehashbylabel,
4682                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4683                 zone->zone_flags |= ZF_HASHED_LABEL;
4684         }
4685 
4686         /*
4687          * Insert into active list.  At this point there are no 'hold's
4688          * on the zone, but everyone else knows not to use it, so we can
4689          * continue to use it.  zsched() will do a zone_hold() if the
4690          * newproc() is successful.
4691          */
4692         list_insert_tail(&zone_active, zone);
4693         mutex_exit(&zonehash_lock);
4694 
4695         zarg.zone = zone;
4696         zarg.nvlist = rctls;
4697         /*
4698          * The process, task, and project rctls are probably wrong;
4699          * we need an interface to get the default values of all rctls,
4700          * and initialize zsched appropriately.  I'm not sure that that
4701          * makes much of a difference, though.
4702          */
4703         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4704         if (error != 0) {
4705                 /*
4706                  * We need to undo all globally visible state.
4707                  */
4708                 mutex_enter(&zonehash_lock);
4709                 list_remove(&zone_active, zone);
4710                 if (zone->zone_flags & ZF_HASHED_LABEL) {
4711                         ASSERT(zone->zone_slabel != NULL);
4712                         (void) mod_hash_destroy(zonehashbylabel,
4713                             (mod_hash_key_t)zone->zone_slabel);
4714                 }
4715                 (void) mod_hash_destroy(zonehashbyname,
4716                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4717                 (void) mod_hash_destroy(zonehashbyid,
4718                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4719                 ASSERT(zonecount > 1);
4720                 zonecount--;
4721                 goto errout;
4722         }
4723 
4724         /*
4725          * Zone creation can't fail from now on.
4726          */
4727 
4728         /*
4729          * Create zone kstats
4730          */
4731         zone_kstat_create(zone);
4732 
4733         /*
4734          * Let the other lwps continue.
4735          */
4736         mutex_enter(&pp->p_lock);
4737         if (curthread != pp->p_agenttp)
4738                 continuelwps(pp);
4739         mutex_exit(&pp->p_lock);
4740 
4741         /*
4742          * Wait for zsched to finish initializing the zone.
4743          */
4744         zone_status_wait(zone, ZONE_IS_READY);
4745         /*
4746          * The zone is fully visible, so we can let mounts progress.
4747          */
4748         resume_mounts(zone);
4749         nvlist_free(rctls);
4750 
4751         return (zoneid);
4752 
4753 errout:
4754         mutex_exit(&zonehash_lock);
4755         /*
4756          * Let the other lwps continue.
4757          */
4758         mutex_enter(&pp->p_lock);
4759         if (curthread != pp->p_agenttp)
4760                 continuelwps(pp);
4761         mutex_exit(&pp->p_lock);
4762 
4763         resume_mounts(zone);
4764         nvlist_free(rctls);
4765         /*
4766          * There is currently one reference to the zone, a cred_ref from
4767          * zone_kcred.  To free the zone, we call crfree, which will call
4768          * zone_cred_rele, which will call zone_free.
4769          */
4770         ASSERT(zone->zone_cred_ref == 1);
4771         ASSERT(zone->zone_kcred->cr_ref == 1);
4772         ASSERT(zone->zone_ref == 0);
4773         zkcr = zone->zone_kcred;
4774         zone->zone_kcred = NULL;
4775         crfree(zkcr);                           /* triggers call to zone_free */
4776         return (zone_create_error(error, error2, extended_error));
4777 }
4778 
4779 /*
4780  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4781  * the heavy lifting.  initname is the path to the program to launch
4782  * at the "top" of the zone; if this is NULL, we use the system default,
4783  * which is stored at zone_default_initname.
4784  */
4785 static int
4786 zone_boot(zoneid_t zoneid)
4787 {
4788         int err;
4789         zone_t *zone;
4790 
4791         if (secpolicy_zone_config(CRED()) != 0)
4792                 return (set_errno(EPERM));
4793         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4794                 return (set_errno(EINVAL));
4795 
4796         mutex_enter(&zonehash_lock);
4797         /*
4798          * Look for zone under hash lock to prevent races with calls to
4799          * zone_shutdown, zone_destroy, etc.
4800          */
4801         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4802                 mutex_exit(&zonehash_lock);
4803                 return (set_errno(EINVAL));
4804         }
4805 
4806         mutex_enter(&zone_status_lock);
4807         if (zone_status_get(zone) != ZONE_IS_READY) {
4808                 mutex_exit(&zone_status_lock);
4809                 mutex_exit(&zonehash_lock);
4810                 return (set_errno(EINVAL));
4811         }
4812         zone_status_set(zone, ZONE_IS_BOOTING);
4813         mutex_exit(&zone_status_lock);
4814 
4815         zone_hold(zone);        /* so we can use the zone_t later */
4816         mutex_exit(&zonehash_lock);
4817 
4818         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4819                 zone_rele(zone);
4820                 return (set_errno(EINTR));
4821         }
4822 
4823         /*
4824          * Boot (starting init) might have failed, in which case the zone
4825          * will go to the SHUTTING_DOWN state; an appropriate errno will
4826          * be placed in zone->zone_boot_err, and so we return that.
4827          */
4828         err = zone->zone_boot_err;
4829         zone_rele(zone);
4830         return (err ? set_errno(err) : 0);
4831 }
4832 
4833 /*
4834  * Kills all user processes in the zone, waiting for them all to exit
4835  * before returning.
4836  */
4837 static int
4838 zone_empty(zone_t *zone)
4839 {
4840         int waitstatus;
4841 
4842         /*
4843          * We need to drop zonehash_lock before killing all
4844          * processes, otherwise we'll deadlock with zone_find_*
4845          * which can be called from the exit path.
4846          */
4847         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4848         while ((waitstatus = zone_status_timedwait_sig(zone,
4849             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4850                 killall(zone->zone_id);
4851         }
4852         /*
4853          * return EINTR if we were signaled
4854          */
4855         if (waitstatus == 0)
4856                 return (EINTR);
4857         return (0);
4858 }
4859 
4860 /*
4861  * This function implements the policy for zone visibility.
4862  *
4863  * In standard Solaris, a non-global zone can only see itself.
4864  *
4865  * In Trusted Extensions, a labeled zone can lookup any zone whose label
4866  * it dominates. For this test, the label of the global zone is treated as
4867  * admin_high so it is special-cased instead of being checked for dominance.
4868  *
4869  * Returns true if zone attributes are viewable, false otherwise.
4870  */
4871 static boolean_t
4872 zone_list_access(zone_t *zone)
4873 {
4874 
4875         if (curproc->p_zone == global_zone ||
4876             curproc->p_zone == zone) {
4877                 return (B_TRUE);
4878         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4879                 bslabel_t *curproc_label;
4880                 bslabel_t *zone_label;
4881 
4882                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4883                 zone_label = label2bslabel(zone->zone_slabel);
4884 
4885                 if (zone->zone_id != GLOBAL_ZONEID &&
4886                     bldominates(curproc_label, zone_label)) {
4887                         return (B_TRUE);
4888                 } else {
4889                         return (B_FALSE);
4890                 }
4891         } else {
4892                 return (B_FALSE);
4893         }
4894 }
4895 
4896 /*
4897  * Systemcall to start the zone's halt sequence.  By the time this
4898  * function successfully returns, all user processes and kernel threads
4899  * executing in it will have exited, ZSD shutdown callbacks executed,
4900  * and the zone status set to ZONE_IS_DOWN.
4901  *
4902  * It is possible that the call will interrupt itself if the caller is the
4903  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4904  */
4905 static int
4906 zone_shutdown(zoneid_t zoneid)
4907 {
4908         int error;
4909         zone_t *zone;
4910         zone_status_t status;
4911 
4912         if (secpolicy_zone_config(CRED()) != 0)
4913                 return (set_errno(EPERM));
4914         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4915                 return (set_errno(EINVAL));
4916 
4917         mutex_enter(&zonehash_lock);
4918         /*
4919          * Look for zone under hash lock to prevent races with other
4920          * calls to zone_shutdown and zone_destroy.
4921          */
4922         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4923                 mutex_exit(&zonehash_lock);
4924                 return (set_errno(EINVAL));
4925         }
4926 
4927         /*
4928          * We have to drop zonehash_lock before calling block_mounts.
4929          * Hold the zone so we can continue to use the zone_t.
4930          */
4931         zone_hold(zone);
4932         mutex_exit(&zonehash_lock);
4933 
4934         /*
4935          * Block mounts so that VFS_MOUNT() can get an accurate view of
4936          * the zone's status with regards to ZONE_IS_SHUTTING down.
4937          *
4938          * e.g. NFS can fail the mount if it determines that the zone
4939          * has already begun the shutdown sequence.
4940          *
4941          */
4942         if (block_mounts(zone) == 0) {
4943                 zone_rele(zone);
4944                 return (set_errno(EINTR));
4945         }
4946 
4947         mutex_enter(&zonehash_lock);
4948         mutex_enter(&zone_status_lock);
4949         status = zone_status_get(zone);
4950         /*
4951          * Fail if the zone isn't fully initialized yet.
4952          */
4953         if (status < ZONE_IS_READY) {
4954                 mutex_exit(&zone_status_lock);
4955                 mutex_exit(&zonehash_lock);
4956                 resume_mounts(zone);
4957                 zone_rele(zone);
4958                 return (set_errno(EINVAL));
4959         }
4960         /*
4961          * If conditions required for zone_shutdown() to return have been met,
4962          * return success.
4963          */
4964         if (status >= ZONE_IS_DOWN) {
4965                 mutex_exit(&zone_status_lock);
4966                 mutex_exit(&zonehash_lock);
4967                 resume_mounts(zone);
4968                 zone_rele(zone);
4969                 return (0);
4970         }
4971         /*
4972          * If zone_shutdown() hasn't been called before, go through the motions.
4973          * If it has, there's nothing to do but wait for the kernel threads to
4974          * drain.
4975          */
4976         if (status < ZONE_IS_EMPTY) {
4977                 uint_t ntasks;
4978 
4979                 mutex_enter(&zone->zone_lock);
4980                 if ((ntasks = zone->zone_ntasks) != 1) {
4981                         /*
4982                          * There's still stuff running.
4983                          */
4984                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4985                 }
4986                 mutex_exit(&zone->zone_lock);
4987                 if (ntasks == 1) {
4988                         /*
4989                          * The only way to create another task is through
4990                          * zone_enter(), which will block until we drop
4991                          * zonehash_lock.  The zone is empty.
4992                          */
4993                         if (zone->zone_kthreads == NULL) {
4994                                 /*
4995                                  * Skip ahead to ZONE_IS_DOWN
4996                                  */
4997                                 zone_status_set(zone, ZONE_IS_DOWN);
4998                         } else {
4999                                 zone_status_set(zone, ZONE_IS_EMPTY);
5000                         }
5001                 }
5002         }
5003         mutex_exit(&zone_status_lock);
5004         mutex_exit(&zonehash_lock);
5005         resume_mounts(zone);
5006 
5007         if (error = zone_empty(zone)) {
5008                 zone_rele(zone);
5009                 return (set_errno(error));
5010         }
5011         /*
5012          * After the zone status goes to ZONE_IS_DOWN this zone will no
5013          * longer be notified of changes to the pools configuration, so
5014          * in order to not end up with a stale pool pointer, we point
5015          * ourselves at the default pool and remove all resource
5016          * visibility.  This is especially important as the zone_t may
5017          * languish on the deathrow for a very long time waiting for
5018          * cred's to drain out.
5019          *
5020          * This rebinding of the zone can happen multiple times
5021          * (presumably due to interrupted or parallel systemcalls)
5022          * without any adverse effects.
5023          */
5024         if (pool_lock_intr() != 0) {
5025                 zone_rele(zone);
5026                 return (set_errno(EINTR));
5027         }
5028         if (pool_state == POOL_ENABLED) {
5029                 mutex_enter(&cpu_lock);
5030                 zone_pool_set(zone, pool_default);
5031                 /*
5032                  * The zone no longer needs to be able to see any cpus.
5033                  */
5034                 zone_pset_set(zone, ZONE_PS_INVAL);
5035                 mutex_exit(&cpu_lock);
5036         }
5037         pool_unlock();
5038 
5039         /*
5040          * ZSD shutdown callbacks can be executed multiple times, hence
5041          * it is safe to not be holding any locks across this call.
5042          */
5043         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5044 
5045         mutex_enter(&zone_status_lock);
5046         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5047                 zone_status_set(zone, ZONE_IS_DOWN);
5048         mutex_exit(&zone_status_lock);
5049 
5050         /*
5051          * Wait for kernel threads to drain.
5052          */
5053         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5054                 zone_rele(zone);
5055                 return (set_errno(EINTR));
5056         }
5057 
5058         /*
5059          * Zone can be become down/destroyable even if the above wait
5060          * returns EINTR, so any code added here may never execute.
5061          * (i.e. don't add code here)
5062          */
5063 
5064         zone_rele(zone);
5065         return (0);
5066 }
5067 
5068 /*
5069  * Log the specified zone's reference counts.  The caller should not be
5070  * holding the zone's zone_lock.
5071  */
5072 static void
5073 zone_log_refcounts(zone_t *zone)
5074 {
5075         char *buffer;
5076         char *buffer_position;
5077         uint32_t buffer_size;
5078         uint32_t index;
5079         uint_t ref;
5080         uint_t cred_ref;
5081 
5082         /*
5083          * Construct a string representing the subsystem-specific reference
5084          * counts.  The counts are printed in ascending order by index into the
5085          * zone_t::zone_subsys_ref array.  The list will be surrounded by
5086          * square brackets [] and will only contain nonzero reference counts.
5087          *
5088          * The buffer will hold two square bracket characters plus ten digits,
5089          * one colon, one space, one comma, and some characters for a
5090          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5091          * bit integers have at most ten decimal digits.)  The last
5092          * reference count's comma is replaced by the closing square
5093          * bracket and a NULL character to terminate the string.
5094          *
5095          * NOTE: We have to grab the zone's zone_lock to create a consistent
5096          * snapshot of the zone's reference counters.
5097          *
5098          * First, figure out how much space the string buffer will need.
5099          * The buffer's size is stored in buffer_size.
5100          */
5101         buffer_size = 2;                        /* for the square brackets */
5102         mutex_enter(&zone->zone_lock);
5103         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5104         ref = zone->zone_ref;
5105         cred_ref = zone->zone_cred_ref;
5106         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5107                 if (zone->zone_subsys_ref[index] != 0)
5108                         buffer_size += strlen(zone_ref_subsys_names[index]) +
5109                             13;
5110         if (buffer_size == 2) {
5111                 /*
5112                  * No subsystems had nonzero reference counts.  Don't bother
5113                  * with allocating a buffer; just log the general-purpose and
5114                  * credential reference counts.
5115                  */
5116                 mutex_exit(&zone->zone_lock);
5117                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5118                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
5119                     "references and %u credential references are still extant",
5120                     zone->zone_name, zone->zone_id, ref, cred_ref);
5121                 return;
5122         }
5123 
5124         /*
5125          * buffer_size contains the exact number of characters that the
5126          * buffer will need.  Allocate the buffer and fill it with nonzero
5127          * subsystem-specific reference counts.  Surround the results with
5128          * square brackets afterwards.
5129          */
5130         buffer = kmem_alloc(buffer_size, KM_SLEEP);
5131         buffer_position = &buffer[1];
5132         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5133                 /*
5134                  * NOTE: The DDI's version of sprintf() returns a pointer to
5135                  * the modified buffer rather than the number of bytes written
5136                  * (as in snprintf(3C)).  This is unfortunate and annoying.
5137                  * Therefore, we'll use snprintf() with INT_MAX to get the
5138                  * number of bytes written.  Using INT_MAX is safe because
5139                  * the buffer is perfectly sized for the data: we'll never
5140                  * overrun the buffer.
5141                  */
5142                 if (zone->zone_subsys_ref[index] != 0)
5143                         buffer_position += snprintf(buffer_position, INT_MAX,
5144                             "%s: %u,", zone_ref_subsys_names[index],
5145                             zone->zone_subsys_ref[index]);
5146         }
5147         mutex_exit(&zone->zone_lock);
5148         buffer[0] = '[';
5149         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5150         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5151         buffer_position[-1] = ']';
5152 
5153         /*
5154          * Log the reference counts and free the message buffer.
5155          */
5156         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5157             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5158             "%u credential references are still extant %s", zone->zone_name,
5159             zone->zone_id, ref, cred_ref, buffer);
5160         kmem_free(buffer, buffer_size);
5161 }
5162 
5163 /*
5164  * Systemcall entry point to finalize the zone halt process.  The caller
5165  * must have already successfully called zone_shutdown().
5166  *
5167  * Upon successful completion, the zone will have been fully destroyed:
5168  * zsched will have exited, destructor callbacks executed, and the zone
5169  * removed from the list of active zones.
5170  */
5171 static int
5172 zone_destroy(zoneid_t zoneid)
5173 {
5174         uint64_t uniqid;
5175         zone_t *zone;
5176         zone_status_t status;
5177         clock_t wait_time;
5178         boolean_t log_refcounts;
5179 
5180         if (secpolicy_zone_config(CRED()) != 0)
5181                 return (set_errno(EPERM));
5182         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5183                 return (set_errno(EINVAL));
5184 
5185         mutex_enter(&zonehash_lock);
5186         /*
5187          * Look for zone under hash lock to prevent races with other
5188          * calls to zone_destroy.
5189          */
5190         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5191                 mutex_exit(&zonehash_lock);
5192                 return (set_errno(EINVAL));
5193         }
5194 
5195         if (zone_mount_count(zone->zone_rootpath) != 0) {
5196                 mutex_exit(&zonehash_lock);
5197                 return (set_errno(EBUSY));
5198         }
5199         mutex_enter(&zone_status_lock);
5200         status = zone_status_get(zone);
5201         if (status < ZONE_IS_DOWN) {
5202                 mutex_exit(&zone_status_lock);
5203                 mutex_exit(&zonehash_lock);
5204                 return (set_errno(EBUSY));
5205         } else if (status == ZONE_IS_DOWN) {
5206                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5207         }
5208         mutex_exit(&zone_status_lock);
5209         zone_hold(zone);
5210         mutex_exit(&zonehash_lock);
5211 
5212         /*
5213          * wait for zsched to exit
5214          */
5215         zone_status_wait(zone, ZONE_IS_DEAD);
5216         zone_zsd_callbacks(zone, ZSD_DESTROY);
5217         zone->zone_netstack = NULL;
5218         uniqid = zone->zone_uniqid;
5219         zone_rele(zone);
5220         zone = NULL;    /* potentially free'd */
5221 
5222         log_refcounts = B_FALSE;
5223         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5224         mutex_enter(&zonehash_lock);
5225         for (; /* ever */; ) {
5226                 boolean_t unref;
5227                 boolean_t refs_have_been_logged;
5228 
5229                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5230                     zone->zone_uniqid != uniqid) {
5231                         /*
5232                          * The zone has gone away.  Necessary conditions
5233                          * are met, so we return success.
5234                          */
5235                         mutex_exit(&zonehash_lock);
5236                         return (0);
5237                 }
5238                 mutex_enter(&zone->zone_lock);
5239                 unref = ZONE_IS_UNREF(zone);
5240                 refs_have_been_logged = (zone->zone_flags &
5241                     ZF_REFCOUNTS_LOGGED);
5242                 mutex_exit(&zone->zone_lock);
5243                 if (unref) {
5244                         /*
5245                          * There is only one reference to the zone -- that
5246                          * added when the zone was added to the hashtables --
5247                          * and things will remain this way until we drop
5248                          * zonehash_lock... we can go ahead and cleanup the
5249                          * zone.
5250                          */
5251                         break;
5252                 }
5253 
5254                 /*
5255                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5256                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5257                  * some zone's general-purpose reference count reaches one.
5258                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5259                  * on zone_destroy_cv, then log the zone's reference counts and
5260                  * continue to wait for zone_rele() and zone_cred_rele().
5261                  */
5262                 if (!refs_have_been_logged) {
5263                         if (!log_refcounts) {
5264                                 /*
5265                                  * This thread hasn't timed out waiting on
5266                                  * zone_destroy_cv yet.  Wait wait_time clock
5267                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5268                                  * seconds) for the zone's references to clear.
5269                                  */
5270                                 ASSERT(wait_time > 0);
5271                                 wait_time = cv_reltimedwait_sig(
5272                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5273                                     TR_SEC);
5274                                 if (wait_time > 0) {
5275                                         /*
5276                                          * A thread in zone_rele() or
5277                                          * zone_cred_rele() signaled
5278                                          * zone_destroy_cv before this thread's
5279                                          * wait timed out.  The zone might have
5280                                          * only one reference left; find out!
5281                                          */
5282                                         continue;
5283                                 } else if (wait_time == 0) {
5284                                         /* The thread's process was signaled. */
5285                                         mutex_exit(&zonehash_lock);
5286                                         return (set_errno(EINTR));
5287                                 }
5288 
5289                                 /*
5290                                  * The thread timed out while waiting on
5291                                  * zone_destroy_cv.  Even though the thread
5292                                  * timed out, it has to check whether another
5293                                  * thread woke up from zone_destroy_cv and
5294                                  * destroyed the zone.
5295                                  *
5296                                  * If the zone still exists and has more than
5297                                  * one unreleased general-purpose reference,
5298                                  * then log the zone's reference counts.
5299                                  */
5300                                 log_refcounts = B_TRUE;
5301                                 continue;
5302                         }
5303 
5304                         /*
5305                          * The thread already timed out on zone_destroy_cv while
5306                          * waiting for subsystems to release the zone's last
5307                          * general-purpose references.  Log the zone's reference
5308                          * counts and wait indefinitely on zone_destroy_cv.
5309                          */
5310                         zone_log_refcounts(zone);
5311                 }
5312                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5313                         /* The thread's process was signaled. */
5314                         mutex_exit(&zonehash_lock);
5315                         return (set_errno(EINTR));
5316                 }
5317         }
5318 
5319         /*
5320          * Remove CPU cap for this zone now since we're not going to
5321          * fail below this point.
5322          */
5323         cpucaps_zone_remove(zone);
5324 
5325         /* Get rid of the zone's kstats */
5326         zone_kstat_delete(zone);
5327 
5328         /* remove the pfexecd doors */
5329         if (zone->zone_pfexecd != NULL) {
5330                 klpd_freelist(&zone->zone_pfexecd);
5331                 zone->zone_pfexecd = NULL;
5332         }
5333 
5334         /* free brand specific data */
5335         if (ZONE_IS_BRANDED(zone))
5336                 ZBROP(zone)->b_free_brand_data(zone);
5337 
5338         /* Say goodbye to brand framework. */
5339         brand_unregister_zone(zone->zone_brand);
5340 
5341         /*
5342          * It is now safe to let the zone be recreated; remove it from the
5343          * lists.  The memory will not be freed until the last cred
5344          * reference goes away.
5345          */
5346         ASSERT(zonecount > 1);       /* must be > 1; can't destroy global zone */
5347         zonecount--;
5348         /* remove from active list and hash tables */
5349         list_remove(&zone_active, zone);
5350         (void) mod_hash_destroy(zonehashbyname,
5351             (mod_hash_key_t)zone->zone_name);
5352         (void) mod_hash_destroy(zonehashbyid,
5353             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5354         if (zone->zone_flags & ZF_HASHED_LABEL)
5355                 (void) mod_hash_destroy(zonehashbylabel,
5356                     (mod_hash_key_t)zone->zone_slabel);
5357         mutex_exit(&zonehash_lock);
5358 
5359         /*
5360          * Release the root vnode; we're not using it anymore.  Nor should any
5361          * other thread that might access it exist.
5362          */
5363         if (zone->zone_rootvp != NULL) {
5364                 VN_RELE(zone->zone_rootvp);
5365                 zone->zone_rootvp = NULL;
5366         }
5367 
5368         /* add to deathrow list */
5369         mutex_enter(&zone_deathrow_lock);
5370         list_insert_tail(&zone_deathrow, zone);
5371         mutex_exit(&zone_deathrow_lock);
5372 
5373         /*
5374          * Drop last reference (which was added by zsched()), this will
5375          * free the zone unless there are outstanding cred references.
5376          */
5377         zone_rele(zone);
5378         return (0);
5379 }
5380 
5381 /*
5382  * Systemcall entry point for zone_getattr(2).
5383  */
5384 static ssize_t
5385 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5386 {
5387         size_t size;
5388         int error = 0, err;
5389         zone_t *zone;
5390         char *zonepath;
5391         char *outstr;
5392         zone_status_t zone_status;
5393         pid_t initpid;
5394         boolean_t global = (curzone == global_zone);
5395         boolean_t inzone = (curzone->zone_id == zoneid);
5396         ushort_t flags;
5397         zone_net_data_t *zbuf;
5398 
5399         mutex_enter(&zonehash_lock);
5400         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5401                 mutex_exit(&zonehash_lock);
5402                 return (set_errno(EINVAL));
5403         }
5404         zone_status = zone_status_get(zone);
5405         if (zone_status < ZONE_IS_INITIALIZED) {
5406                 mutex_exit(&zonehash_lock);
5407                 return (set_errno(EINVAL));
5408         }
5409         zone_hold(zone);
5410         mutex_exit(&zonehash_lock);
5411 
5412         /*
5413          * If not in the global zone, don't show information about other zones,
5414          * unless the system is labeled and the local zone's label dominates
5415          * the other zone.
5416          */
5417         if (!zone_list_access(zone)) {
5418                 zone_rele(zone);
5419                 return (set_errno(EINVAL));
5420         }
5421 
5422         switch (attr) {
5423         case ZONE_ATTR_ROOT:
5424                 if (global) {
5425                         /*
5426                          * Copy the path to trim the trailing "/" (except for
5427                          * the global zone).
5428                          */
5429                         if (zone != global_zone)
5430                                 size = zone->zone_rootpathlen - 1;
5431                         else
5432                                 size = zone->zone_rootpathlen;
5433                         zonepath = kmem_alloc(size, KM_SLEEP);
5434                         bcopy(zone->zone_rootpath, zonepath, size);
5435                         zonepath[size - 1] = '\0';
5436                 } else {
5437                         if (inzone || !is_system_labeled()) {
5438                                 /*
5439                                  * Caller is not in the global zone.
5440                                  * if the query is on the current zone
5441                                  * or the system is not labeled,
5442                                  * just return faked-up path for current zone.
5443                                  */
5444                                 zonepath = "/";
5445                                 size = 2;
5446                         } else {
5447                                 /*
5448                                  * Return related path for current zone.
5449                                  */
5450                                 int prefix_len = strlen(zone_prefix);
5451                                 int zname_len = strlen(zone->zone_name);
5452 
5453                                 size = prefix_len + zname_len + 1;
5454                                 zonepath = kmem_alloc(size, KM_SLEEP);
5455                                 bcopy(zone_prefix, zonepath, prefix_len);
5456                                 bcopy(zone->zone_name, zonepath +
5457                                     prefix_len, zname_len);
5458                                 zonepath[size - 1] = '\0';
5459                         }
5460                 }
5461                 if (bufsize > size)
5462                         bufsize = size;
5463                 if (buf != NULL) {
5464                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5465                         if (err != 0 && err != ENAMETOOLONG)
5466                                 error = EFAULT;
5467                 }
5468                 if (global || (is_system_labeled() && !inzone))
5469                         kmem_free(zonepath, size);
5470                 break;
5471 
5472         case ZONE_ATTR_NAME:
5473                 size = strlen(zone->zone_name) + 1;
5474                 if (bufsize > size)
5475                         bufsize = size;
5476                 if (buf != NULL) {
5477                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5478                         if (err != 0 && err != ENAMETOOLONG)
5479                                 error = EFAULT;
5480                 }
5481                 break;
5482 
5483         case ZONE_ATTR_STATUS:
5484                 /*
5485                  * Since we're not holding zonehash_lock, the zone status
5486                  * may be anything; leave it up to userland to sort it out.
5487                  */
5488                 size = sizeof (zone_status);
5489                 if (bufsize > size)
5490                         bufsize = size;
5491                 zone_status = zone_status_get(zone);
5492                 if (buf != NULL &&
5493                     copyout(&zone_status, buf, bufsize) != 0)
5494                         error = EFAULT;
5495                 break;
5496         case ZONE_ATTR_FLAGS:
5497                 size = sizeof (zone->zone_flags);
5498                 if (bufsize > size)
5499                         bufsize = size;
5500                 flags = zone->zone_flags;
5501                 if (buf != NULL &&
5502                     copyout(&flags, buf, bufsize) != 0)
5503                         error = EFAULT;
5504                 break;
5505         case ZONE_ATTR_PRIVSET:
5506                 size = sizeof (priv_set_t);
5507                 if (bufsize > size)
5508                         bufsize = size;
5509                 if (buf != NULL &&
5510                     copyout(zone->zone_privset, buf, bufsize) != 0)
5511                         error = EFAULT;
5512                 break;
5513         case ZONE_ATTR_UNIQID:
5514                 size = sizeof (zone->zone_uniqid);
5515                 if (bufsize > size)
5516                         bufsize = size;
5517                 if (buf != NULL &&
5518                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5519                         error = EFAULT;
5520                 break;
5521         case ZONE_ATTR_POOLID:
5522                 {
5523                         pool_t *pool;
5524                         poolid_t poolid;
5525 
5526                         if (pool_lock_intr() != 0) {
5527                                 error = EINTR;
5528                                 break;
5529                         }
5530                         pool = zone_pool_get(zone);
5531                         poolid = pool->pool_id;
5532                         pool_unlock();
5533                         size = sizeof (poolid);
5534                         if (bufsize > size)
5535                                 bufsize = size;
5536                         if (buf != NULL && copyout(&poolid, buf, size) != 0)
5537                                 error = EFAULT;
5538                 }
5539                 break;
5540         case ZONE_ATTR_SLBL:
5541                 size = sizeof (bslabel_t);
5542                 if (bufsize > size)
5543                         bufsize = size;
5544                 if (zone->zone_slabel == NULL)
5545                         error = EINVAL;
5546                 else if (buf != NULL &&
5547                     copyout(label2bslabel(zone->zone_slabel), buf,
5548                     bufsize) != 0)
5549                         error = EFAULT;
5550                 break;
5551         case ZONE_ATTR_INITPID:
5552                 size = sizeof (initpid);
5553                 if (bufsize > size)
5554                         bufsize = size;
5555                 initpid = zone->zone_proc_initpid;
5556                 if (initpid == -1) {
5557                         error = ESRCH;
5558                         break;
5559                 }
5560                 if (buf != NULL &&
5561                     copyout(&initpid, buf, bufsize) != 0)
5562                         error = EFAULT;
5563                 break;
5564         case ZONE_ATTR_BRAND:
5565                 size = strlen(zone->zone_brand->b_name) + 1;
5566 
5567                 if (bufsize > size)
5568                         bufsize = size;
5569                 if (buf != NULL) {
5570                         err = copyoutstr(zone->zone_brand->b_name, buf,
5571                             bufsize, NULL);
5572                         if (err != 0 && err != ENAMETOOLONG)
5573                                 error = EFAULT;
5574                 }
5575                 break;
5576         case ZONE_ATTR_INITNAME:
5577                 size = strlen(zone->zone_initname) + 1;
5578                 if (bufsize > size)
5579                         bufsize = size;
5580                 if (buf != NULL) {
5581                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5582                             NULL);
5583                         if (err != 0 && err != ENAMETOOLONG)
5584                                 error = EFAULT;
5585                 }
5586                 break;
5587         case ZONE_ATTR_BOOTARGS:
5588                 if (zone->zone_bootargs == NULL)
5589                         outstr = "";
5590                 else
5591                         outstr = zone->zone_bootargs;
5592                 size = strlen(outstr) + 1;
5593                 if (bufsize > size)
5594                         bufsize = size;
5595                 if (buf != NULL) {
5596                         err = copyoutstr(outstr, buf, bufsize, NULL);
5597                         if (err != 0 && err != ENAMETOOLONG)
5598                                 error = EFAULT;
5599                 }
5600                 break;
5601         case ZONE_ATTR_PHYS_MCAP:
5602                 size = sizeof (zone->zone_phys_mcap);
5603                 if (bufsize > size)
5604                         bufsize = size;
5605                 if (buf != NULL &&
5606                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5607                         error = EFAULT;
5608                 break;
5609         case ZONE_ATTR_SCHED_CLASS:
5610                 mutex_enter(&class_lock);
5611 
5612                 if (zone->zone_defaultcid >= loaded_classes)
5613                         outstr = "";
5614                 else
5615                         outstr = sclass[zone->zone_defaultcid].cl_name;
5616                 size = strlen(outstr) + 1;
5617                 if (bufsize > size)
5618                         bufsize = size;
5619                 if (buf != NULL) {
5620                         err = copyoutstr(outstr, buf, bufsize, NULL);
5621                         if (err != 0 && err != ENAMETOOLONG)
5622                                 error = EFAULT;
5623                 }
5624 
5625                 mutex_exit(&class_lock);
5626                 break;
5627         case ZONE_ATTR_HOSTID:
5628                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5629                     bufsize == sizeof (zone->zone_hostid)) {
5630                         size = sizeof (zone->zone_hostid);
5631                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5632                             bufsize) != 0)
5633                                 error = EFAULT;
5634                 } else {
5635                         error = EINVAL;
5636                 }
5637                 break;
5638         case ZONE_ATTR_FS_ALLOWED:
5639                 if (zone->zone_fs_allowed == NULL)
5640                         outstr = "";
5641                 else
5642                         outstr = zone->zone_fs_allowed;
5643                 size = strlen(outstr) + 1;
5644                 if (bufsize > size)
5645                         bufsize = size;
5646                 if (buf != NULL) {
5647                         err = copyoutstr(outstr, buf, bufsize, NULL);
5648                         if (err != 0 && err != ENAMETOOLONG)
5649                                 error = EFAULT;
5650                 }
5651                 break;
5652         case ZONE_ATTR_SECFLAGS:
5653                 size = sizeof (zone->zone_secflags);
5654                 if (bufsize > size)
5655                         bufsize = size;
5656                 if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
5657                         error = EFAULT;
5658                 break;
5659         case ZONE_ATTR_NETWORK:
5660                 bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
5661                 size = bufsize;
5662                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5663                 if (copyin(buf, zbuf, bufsize) != 0) {
5664                         error = EFAULT;
5665                 } else {
5666                         error = zone_get_network(zoneid, zbuf);
5667                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5668                                 error = EFAULT;
5669                 }
5670                 kmem_free(zbuf, bufsize);
5671                 break;
5672         default:
5673                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5674                         size = bufsize;
5675                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5676                 } else {
5677                         error = EINVAL;
5678                 }
5679         }
5680         zone_rele(zone);
5681 
5682         if (error)
5683                 return (set_errno(error));
5684         return ((ssize_t)size);
5685 }
5686 
5687 /*
5688  * Systemcall entry point for zone_setattr(2).
5689  */
5690 /*ARGSUSED*/
5691 static int
5692 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5693 {
5694         zone_t *zone;
5695         zone_status_t zone_status;
5696         int err = -1;
5697         zone_net_data_t *zbuf;
5698 
5699         if (secpolicy_zone_config(CRED()) != 0)
5700                 return (set_errno(EPERM));
5701 
5702         /*
5703          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5704          * global zone.
5705          */
5706         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5707                 return (set_errno(EINVAL));
5708         }
5709 
5710         mutex_enter(&zonehash_lock);
5711         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5712                 mutex_exit(&zonehash_lock);
5713                 return (set_errno(EINVAL));
5714         }
5715         zone_hold(zone);
5716         mutex_exit(&zonehash_lock);
5717 
5718         /*
5719          * At present most attributes can only be set on non-running,
5720          * non-global zones.
5721          */
5722         zone_status = zone_status_get(zone);
5723         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5724                 err = EINVAL;
5725                 goto done;
5726         }
5727 
5728         switch (attr) {
5729         case ZONE_ATTR_INITNAME:
5730                 err = zone_set_initname(zone, (const char *)buf);
5731                 break;
5732         case ZONE_ATTR_INITNORESTART:
5733                 zone->zone_restart_init = B_FALSE;
5734                 err = 0;
5735                 break;
5736         case ZONE_ATTR_BOOTARGS:
5737                 err = zone_set_bootargs(zone, (const char *)buf);
5738                 break;
5739         case ZONE_ATTR_BRAND:
5740                 err = zone_set_brand(zone, (const char *)buf);
5741                 break;
5742         case ZONE_ATTR_FS_ALLOWED:
5743                 err = zone_set_fs_allowed(zone, (const char *)buf);
5744                 break;
5745         case ZONE_ATTR_SECFLAGS:
5746                 err = zone_set_secflags(zone, (psecflags_t *)buf);
5747                 break;
5748         case ZONE_ATTR_PHYS_MCAP:
5749                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5750                 break;
5751         case ZONE_ATTR_SCHED_CLASS:
5752                 err = zone_set_sched_class(zone, (const char *)buf);
5753                 break;
5754         case ZONE_ATTR_HOSTID:
5755                 if (bufsize == sizeof (zone->zone_hostid)) {
5756                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5757                                 err = 0;
5758                         else
5759                                 err = EFAULT;
5760                 } else {
5761                         err = EINVAL;
5762                 }
5763                 break;
5764         case ZONE_ATTR_NETWORK:
5765                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5766                         err = EINVAL;
5767                         break;
5768                 }
5769                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5770                 if (copyin(buf, zbuf, bufsize) != 0) {
5771                         kmem_free(zbuf, bufsize);
5772                         err = EFAULT;
5773                         break;
5774                 }
5775                 err = zone_set_network(zoneid, zbuf);
5776                 kmem_free(zbuf, bufsize);
5777                 break;
5778         default:
5779                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5780                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5781                 else
5782                         err = EINVAL;
5783         }
5784 
5785 done:
5786         zone_rele(zone);
5787         ASSERT(err != -1);
5788         return (err != 0 ? set_errno(err) : 0);
5789 }
5790 
5791 /*
5792  * Return zero if the process has at least one vnode mapped in to its
5793  * address space which shouldn't be allowed to change zones.
5794  *
5795  * Also return zero if the process has any shared mappings which reserve
5796  * swap.  This is because the counting for zone.max-swap does not allow swap
5797  * reservation to be shared between zones.  zone swap reservation is counted
5798  * on zone->zone_max_swap.
5799  */
5800 static int
5801 as_can_change_zones(void)
5802 {
5803         proc_t *pp = curproc;
5804         struct seg *seg;
5805         struct as *as = pp->p_as;
5806         vnode_t *vp;
5807         int allow = 1;
5808 
5809         ASSERT(pp->p_as != &kas);
5810         AS_LOCK_ENTER(as, RW_READER);
5811         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5812 
5813                 /*
5814                  * Cannot enter zone with shared anon memory which
5815                  * reserves swap.  See comment above.
5816                  */
5817                 if (seg_can_change_zones(seg) == B_FALSE) {
5818                         allow = 0;
5819                         break;
5820                 }
5821                 /*
5822                  * if we can't get a backing vnode for this segment then skip
5823                  * it.
5824                  */
5825                 vp = NULL;
5826                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5827                         continue;
5828                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5829                         allow = 0;
5830                         break;
5831                 }
5832         }
5833         AS_LOCK_EXIT(as);
5834         return (allow);
5835 }
5836 
5837 /*
5838  * Count swap reserved by curproc's address space
5839  */
5840 static size_t
5841 as_swresv(void)
5842 {
5843         proc_t *pp = curproc;
5844         struct seg *seg;
5845         struct as *as = pp->p_as;
5846         size_t swap = 0;
5847 
5848         ASSERT(pp->p_as != &kas);
5849         ASSERT(AS_WRITE_HELD(as));
5850         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5851                 swap += seg_swresv(seg);
5852 
5853         return (swap);
5854 }
5855 
5856 /*
5857  * Systemcall entry point for zone_enter().
5858  *
5859  * The current process is injected into said zone.  In the process
5860  * it will change its project membership, privileges, rootdir/cwd,
5861  * zone-wide rctls, and pool association to match those of the zone.
5862  *
5863  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5864  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5865  * enter a zone that is "ready" or "running".
5866  */
5867 static int
5868 zone_enter(zoneid_t zoneid)
5869 {
5870         zone_t *zone;
5871         vnode_t *vp;
5872         proc_t *pp = curproc;
5873         contract_t *ct;
5874         cont_process_t *ctp;
5875         task_t *tk, *oldtk;
5876         kproject_t *zone_proj0;
5877         cred_t *cr, *newcr;
5878         pool_t *oldpool, *newpool;
5879         sess_t *sp;
5880         uid_t uid;
5881         zone_status_t status;
5882         int err = 0;
5883         rctl_entity_p_t e;
5884         size_t swap;
5885         kthread_id_t t;
5886 
5887         if (secpolicy_zone_config(CRED()) != 0)
5888                 return (set_errno(EPERM));
5889         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5890                 return (set_errno(EINVAL));
5891 
5892         /*
5893          * Stop all lwps so we don't need to hold a lock to look at
5894          * curproc->p_zone.  This needs to happen before we grab any
5895          * locks to avoid deadlock (another lwp in the process could
5896          * be waiting for the held lock).
5897          */
5898         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5899                 return (set_errno(EINTR));
5900 
5901         /*
5902          * Make sure we're not changing zones with files open or mapped in
5903          * to our address space which shouldn't be changing zones.
5904          */
5905         if (!files_can_change_zones()) {
5906                 err = EBADF;
5907                 goto out;
5908         }
5909         if (!as_can_change_zones()) {
5910                 err = EFAULT;
5911                 goto out;
5912         }
5913 
5914         mutex_enter(&zonehash_lock);
5915         if (pp->p_zone != global_zone) {
5916                 mutex_exit(&zonehash_lock);
5917                 err = EINVAL;
5918                 goto out;
5919         }
5920 
5921         zone = zone_find_all_by_id(zoneid);
5922         if (zone == NULL) {
5923                 mutex_exit(&zonehash_lock);
5924                 err = EINVAL;
5925                 goto out;
5926         }
5927 
5928         /*
5929          * To prevent processes in a zone from holding contracts on
5930          * extrazonal resources, and to avoid process contract
5931          * memberships which span zones, contract holders and processes
5932          * which aren't the sole members of their encapsulating process
5933          * contracts are not allowed to zone_enter.
5934          */
5935         ctp = pp->p_ct_process;
5936         ct = &ctp->conp_contract;
5937         mutex_enter(&ct->ct_lock);
5938         mutex_enter(&pp->p_lock);
5939         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5940                 mutex_exit(&pp->p_lock);
5941                 mutex_exit(&ct->ct_lock);
5942                 mutex_exit(&zonehash_lock);
5943                 err = EINVAL;
5944                 goto out;
5945         }
5946 
5947         /*
5948          * Moreover, we don't allow processes whose encapsulating
5949          * process contracts have inherited extrazonal contracts.
5950          * While it would be easier to eliminate all process contracts
5951          * with inherited contracts, we need to be able to give a
5952          * restarted init (or other zone-penetrating process) its
5953          * predecessor's contracts.
5954          */
5955         if (ctp->conp_ninherited != 0) {
5956                 contract_t *next;
5957                 for (next = list_head(&ctp->conp_inherited); next;
5958                     next = list_next(&ctp->conp_inherited, next)) {
5959                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5960                                 mutex_exit(&pp->p_lock);
5961                                 mutex_exit(&ct->ct_lock);
5962                                 mutex_exit(&zonehash_lock);
5963                                 err = EINVAL;
5964                                 goto out;
5965                         }
5966                 }
5967         }
5968 
5969         mutex_exit(&pp->p_lock);
5970         mutex_exit(&ct->ct_lock);
5971 
5972         status = zone_status_get(zone);
5973         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5974                 /*
5975                  * Can't join
5976                  */
5977                 mutex_exit(&zonehash_lock);
5978                 err = EINVAL;
5979                 goto out;
5980         }
5981 
5982         /*
5983          * Make sure new priv set is within the permitted set for caller
5984          */
5985         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5986                 mutex_exit(&zonehash_lock);
5987                 err = EPERM;
5988                 goto out;
5989         }
5990         /*
5991          * We want to momentarily drop zonehash_lock while we optimistically
5992          * bind curproc to the pool it should be running in.  This is safe
5993          * since the zone can't disappear (we have a hold on it).
5994          */
5995         zone_hold(zone);
5996         mutex_exit(&zonehash_lock);
5997 
5998         /*
5999          * Grab pool_lock to keep the pools configuration from changing
6000          * and to stop ourselves from getting rebound to another pool
6001          * until we join the zone.
6002          */
6003         if (pool_lock_intr() != 0) {
6004                 zone_rele(zone);
6005                 err = EINTR;
6006                 goto out;
6007         }
6008         ASSERT(secpolicy_pool(CRED()) == 0);
6009         /*
6010          * Bind ourselves to the pool currently associated with the zone.
6011          */
6012         oldpool = curproc->p_pool;
6013         newpool = zone_pool_get(zone);
6014         if (pool_state == POOL_ENABLED && newpool != oldpool &&
6015             (err = pool_do_bind(newpool, P_PID, P_MYID,
6016             POOL_BIND_ALL)) != 0) {
6017                 pool_unlock();
6018                 zone_rele(zone);
6019                 goto out;
6020         }
6021 
6022         /*
6023          * Grab cpu_lock now; we'll need it later when we call
6024          * task_join().
6025          */
6026         mutex_enter(&cpu_lock);
6027         mutex_enter(&zonehash_lock);
6028         /*
6029          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6030          */
6031         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6032                 /*
6033                  * Can't join anymore.
6034                  */
6035                 mutex_exit(&zonehash_lock);
6036                 mutex_exit(&cpu_lock);
6037                 if (pool_state == POOL_ENABLED &&
6038                     newpool != oldpool)
6039                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
6040                             POOL_BIND_ALL);
6041                 pool_unlock();
6042                 zone_rele(zone);
6043                 err = EINVAL;
6044                 goto out;
6045         }
6046 
6047         /*
6048          * a_lock must be held while transfering locked memory and swap
6049          * reservation from the global zone to the non global zone because
6050          * asynchronous faults on the processes' address space can lock
6051          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6052          * segments respectively.
6053          */
6054         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6055         swap = as_swresv();
6056         mutex_enter(&pp->p_lock);
6057         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6058         /* verify that we do not exceed and task or lwp limits */
6059         mutex_enter(&zone->zone_nlwps_lock);
6060         /* add new lwps to zone and zone's proj0 */
6061         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6062         zone->zone_nlwps += pp->p_lwpcnt;
6063         /* add 1 task to zone's proj0 */
6064         zone_proj0->kpj_ntasks += 1;
6065 
6066         zone_proj0->kpj_nprocs++;
6067         zone->zone_nprocs++;
6068         mutex_exit(&zone->zone_nlwps_lock);
6069 
6070         mutex_enter(&zone->zone_mem_lock);
6071         zone->zone_locked_mem += pp->p_locked_mem;
6072         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6073         zone->zone_max_swap += swap;
6074         mutex_exit(&zone->zone_mem_lock);
6075 
6076         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6077         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6078         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6079 
6080         /* remove lwps and process from proc's old zone and old project */
6081         mutex_enter(&pp->p_zone->zone_nlwps_lock);
6082         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6083         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6084         pp->p_task->tk_proj->kpj_nprocs--;
6085         pp->p_zone->zone_nprocs--;
6086         mutex_exit(&pp->p_zone->zone_nlwps_lock);
6087 
6088         mutex_enter(&pp->p_zone->zone_mem_lock);
6089         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6090         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6091         pp->p_zone->zone_max_swap -= swap;
6092         mutex_exit(&pp->p_zone->zone_mem_lock);
6093 
6094         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6095         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6096         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6097 
6098         pp->p_flag |= SZONETOP;
6099         pp->p_zone = zone;
6100         mutex_exit(&pp->p_lock);
6101         AS_LOCK_EXIT(pp->p_as);
6102 
6103         /*
6104          * Joining the zone cannot fail from now on.
6105          *
6106          * This means that a lot of the following code can be commonized and
6107          * shared with zsched().
6108          */
6109 
6110         /*
6111          * If the process contract fmri was inherited, we need to
6112          * flag this so that any contract status will not leak
6113          * extra zone information, svc_fmri in this case
6114          */
6115         if (ctp->conp_svc_ctid != ct->ct_id) {
6116                 mutex_enter(&ct->ct_lock);
6117                 ctp->conp_svc_zone_enter = ct->ct_id;
6118                 mutex_exit(&ct->ct_lock);
6119         }
6120 
6121         /*
6122          * Reset the encapsulating process contract's zone.
6123          */
6124         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6125         contract_setzuniqid(ct, zone->zone_uniqid);
6126 
6127         /*
6128          * Create a new task and associate the process with the project keyed
6129          * by (projid,zoneid).
6130          *
6131          * We might as well be in project 0; the global zone's projid doesn't
6132          * make much sense in a zone anyhow.
6133          *
6134          * This also increments zone_ntasks, and returns with p_lock held.
6135          */
6136         tk = task_create(0, zone);
6137         oldtk = task_join(tk, 0);
6138         mutex_exit(&cpu_lock);
6139 
6140         /*
6141          * call RCTLOP_SET functions on this proc
6142          */
6143         e.rcep_p.zone = zone;
6144         e.rcep_t = RCENTITY_ZONE;
6145         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6146             RCD_CALLBACK);
6147         mutex_exit(&pp->p_lock);
6148 
6149         /*
6150          * We don't need to hold any of zsched's locks here; not only do we know
6151          * the process and zone aren't going away, we know its session isn't
6152          * changing either.
6153          *
6154          * By joining zsched's session here, we mimic the behavior in the
6155          * global zone of init's sid being the pid of sched.  We extend this
6156          * to all zlogin-like zone_enter()'ing processes as well.
6157          */
6158         mutex_enter(&pidlock);
6159         sp = zone->zone_zsched->p_sessp;
6160         sess_hold(zone->zone_zsched);
6161         mutex_enter(&pp->p_lock);
6162         pgexit(pp);
6163         sess_rele(pp->p_sessp, B_TRUE);
6164         pp->p_sessp = sp;
6165         pgjoin(pp, zone->zone_zsched->p_pidp);
6166 
6167         /*
6168          * If any threads are scheduled to be placed on zone wait queue they
6169          * should abandon the idea since the wait queue is changing.
6170          * We need to be holding pidlock & p_lock to do this.
6171          */
6172         if ((t = pp->p_tlist) != NULL) {
6173                 do {
6174                         thread_lock(t);
6175                         /*
6176                          * Kick this thread so that it doesn't sit
6177                          * on a wrong wait queue.
6178                          */
6179                         if (ISWAITING(t))
6180                                 setrun_locked(t);
6181 
6182                         if (t->t_schedflag & TS_ANYWAITQ)
6183                                 t->t_schedflag &= ~ TS_ANYWAITQ;
6184 
6185                         thread_unlock(t);
6186                 } while ((t = t->t_forw) != pp->p_tlist);
6187         }
6188 
6189         /*
6190          * If there is a default scheduling class for the zone and it is not
6191          * the class we are currently in, change all of the threads in the
6192          * process to the new class.  We need to be holding pidlock & p_lock
6193          * when we call parmsset so this is a good place to do it.
6194          */
6195         if (zone->zone_defaultcid > 0 &&
6196             zone->zone_defaultcid != curthread->t_cid) {
6197                 pcparms_t pcparms;
6198 
6199                 pcparms.pc_cid = zone->zone_defaultcid;
6200                 pcparms.pc_clparms[0] = 0;
6201 
6202                 /*
6203                  * If setting the class fails, we still want to enter the zone.
6204                  */
6205                 if ((t = pp->p_tlist) != NULL) {
6206                         do {
6207                                 (void) parmsset(&pcparms, t);
6208                         } while ((t = t->t_forw) != pp->p_tlist);
6209                 }
6210         }
6211 
6212         mutex_exit(&pp->p_lock);
6213         mutex_exit(&pidlock);
6214 
6215         mutex_exit(&zonehash_lock);
6216         /*
6217          * We're firmly in the zone; let pools progress.
6218          */
6219         pool_unlock();
6220         task_rele(oldtk);
6221         /*
6222          * We don't need to retain a hold on the zone since we already
6223          * incremented zone_ntasks, so the zone isn't going anywhere.
6224          */
6225         zone_rele(zone);
6226 
6227         /*
6228          * Chroot
6229          */
6230         vp = zone->zone_rootvp;
6231         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6232         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6233 
6234         /*
6235          * Change process security flags.  Note that the _effective_ flags
6236          * cannot change
6237          */
6238         secflags_copy(&pp->p_secflags.psf_lower,
6239             &zone->zone_secflags.psf_lower);
6240         secflags_copy(&pp->p_secflags.psf_upper,
6241             &zone->zone_secflags.psf_upper);
6242         secflags_copy(&pp->p_secflags.psf_inherit,
6243             &zone->zone_secflags.psf_inherit);
6244 
6245         /*
6246          * Change process credentials
6247          */
6248         newcr = cralloc();
6249         mutex_enter(&pp->p_crlock);
6250         cr = pp->p_cred;
6251         crcopy_to(cr, newcr);
6252         crsetzone(newcr, zone);
6253         pp->p_cred = newcr;
6254 
6255         /*
6256          * Restrict all process privilege sets to zone limit
6257          */
6258         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6259         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6260         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6261         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6262         mutex_exit(&pp->p_crlock);
6263         crset(pp, newcr);
6264 
6265         /*
6266          * Adjust upcount to reflect zone entry.
6267          */
6268         uid = crgetruid(newcr);
6269         mutex_enter(&pidlock);
6270         upcount_dec(uid, GLOBAL_ZONEID);
6271         upcount_inc(uid, zoneid);
6272         mutex_exit(&pidlock);
6273 
6274         /*
6275          * Set up core file path and content.
6276          */
6277         set_core_defaults();
6278 
6279 out:
6280         /*
6281          * Let the other lwps continue.
6282          */
6283         mutex_enter(&pp->p_lock);
6284         if (curthread != pp->p_agenttp)
6285                 continuelwps(pp);
6286         mutex_exit(&pp->p_lock);
6287 
6288         return (err != 0 ? set_errno(err) : 0);
6289 }
6290 
6291 /*
6292  * Systemcall entry point for zone_list(2).
6293  *
6294  * Processes running in a (non-global) zone only see themselves.
6295  * On labeled systems, they see all zones whose label they dominate.
6296  */
6297 static int
6298 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6299 {
6300         zoneid_t *zoneids;
6301         zone_t *zone, *myzone;
6302         uint_t user_nzones, real_nzones;
6303         uint_t domi_nzones;
6304         int error;
6305 
6306         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6307                 return (set_errno(EFAULT));
6308 
6309         myzone = curproc->p_zone;
6310         if (myzone != global_zone) {
6311                 bslabel_t *mybslab;
6312 
6313                 if (!is_system_labeled()) {
6314                         /* just return current zone */
6315                         real_nzones = domi_nzones = 1;
6316                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6317                         zoneids[0] = myzone->zone_id;
6318                 } else {
6319                         /* return all zones that are dominated */
6320                         mutex_enter(&zonehash_lock);
6321                         real_nzones = zonecount;
6322                         domi_nzones = 0;
6323                         if (real_nzones > 0) {
6324                                 zoneids = kmem_alloc(real_nzones *
6325                                     sizeof (zoneid_t), KM_SLEEP);
6326                                 mybslab = label2bslabel(myzone->zone_slabel);
6327                                 for (zone = list_head(&zone_active);
6328                                     zone != NULL;
6329                                     zone = list_next(&zone_active, zone)) {
6330                                         if (zone->zone_id == GLOBAL_ZONEID)
6331                                                 continue;
6332                                         if (zone != myzone &&
6333                                             (zone->zone_flags & ZF_IS_SCRATCH))
6334                                                 continue;
6335                                         /*
6336                                          * Note that a label always dominates
6337                                          * itself, so myzone is always included
6338                                          * in the list.
6339                                          */
6340                                         if (bldominates(mybslab,
6341                                             label2bslabel(zone->zone_slabel))) {
6342                                                 zoneids[domi_nzones++] =
6343                                                     zone->zone_id;
6344                                         }
6345                                 }
6346                         }
6347                         mutex_exit(&zonehash_lock);
6348                 }
6349         } else {
6350                 mutex_enter(&zonehash_lock);
6351                 real_nzones = zonecount;
6352                 domi_nzones = 0;
6353                 if (real_nzones > 0) {
6354                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6355                             KM_SLEEP);
6356                         for (zone = list_head(&zone_active); zone != NULL;
6357                             zone = list_next(&zone_active, zone))
6358                                 zoneids[domi_nzones++] = zone->zone_id;
6359                         ASSERT(domi_nzones == real_nzones);
6360                 }
6361                 mutex_exit(&zonehash_lock);
6362         }
6363 
6364         /*
6365          * If user has allocated space for fewer entries than we found, then
6366          * return only up to their limit.  Either way, tell them exactly how
6367          * many we found.
6368          */
6369         if (domi_nzones < user_nzones)
6370                 user_nzones = domi_nzones;
6371         error = 0;
6372         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6373                 error = EFAULT;
6374         } else if (zoneidlist != NULL && user_nzones != 0) {
6375                 if (copyout(zoneids, zoneidlist,
6376                     user_nzones * sizeof (zoneid_t)) != 0)
6377                         error = EFAULT;
6378         }
6379 
6380         if (real_nzones > 0)
6381                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6382 
6383         if (error != 0)
6384                 return (set_errno(error));
6385         else
6386                 return (0);
6387 }
6388 
6389 /*
6390  * Systemcall entry point for zone_lookup(2).
6391  *
6392  * Non-global zones are only able to see themselves and (on labeled systems)
6393  * the zones they dominate.
6394  */
6395 static zoneid_t
6396 zone_lookup(const char *zone_name)
6397 {
6398         char *kname;
6399         zone_t *zone;
6400         zoneid_t zoneid;
6401         int err;
6402 
6403         if (zone_name == NULL) {
6404                 /* return caller's zone id */
6405                 return (getzoneid());
6406         }
6407 
6408         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6409         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6410                 kmem_free(kname, ZONENAME_MAX);
6411                 return (set_errno(err));
6412         }
6413 
6414         mutex_enter(&zonehash_lock);
6415         zone = zone_find_all_by_name(kname);
6416         kmem_free(kname, ZONENAME_MAX);
6417         /*
6418          * In a non-global zone, can only lookup global and own name.
6419          * In Trusted Extensions zone label dominance rules apply.
6420          */
6421         if (zone == NULL ||
6422             zone_status_get(zone) < ZONE_IS_READY ||
6423             !zone_list_access(zone)) {
6424                 mutex_exit(&zonehash_lock);
6425                 return (set_errno(EINVAL));
6426         } else {
6427                 zoneid = zone->zone_id;
6428                 mutex_exit(&zonehash_lock);
6429                 return (zoneid);
6430         }
6431 }
6432 
6433 static int
6434 zone_version(int *version_arg)
6435 {
6436         int version = ZONE_SYSCALL_API_VERSION;
6437 
6438         if (copyout(&version, version_arg, sizeof (int)) != 0)
6439                 return (set_errno(EFAULT));
6440         return (0);
6441 }
6442 
6443 /* ARGSUSED */
6444 long
6445 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6446 {
6447         zone_def zs;
6448         int err;
6449 
6450         switch (cmd) {
6451         case ZONE_CREATE:
6452                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6453                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6454                                 return (set_errno(EFAULT));
6455                         }
6456                 } else {
6457 #ifdef _SYSCALL32_IMPL
6458                         zone_def32 zs32;
6459 
6460                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6461                                 return (set_errno(EFAULT));
6462                         }
6463                         zs.zone_name =
6464                             (const char *)(unsigned long)zs32.zone_name;
6465                         zs.zone_root =
6466                             (const char *)(unsigned long)zs32.zone_root;
6467                         zs.zone_privs =
6468                             (const struct priv_set *)
6469                             (unsigned long)zs32.zone_privs;
6470                         zs.zone_privssz = zs32.zone_privssz;
6471                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6472                         zs.rctlbufsz = zs32.rctlbufsz;
6473                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6474                         zs.zfsbufsz = zs32.zfsbufsz;
6475                         zs.extended_error =
6476                             (int *)(unsigned long)zs32.extended_error;
6477                         zs.match = zs32.match;
6478                         zs.doi = zs32.doi;
6479                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6480                         zs.flags = zs32.flags;
6481 #else
6482                         panic("get_udatamodel() returned bogus result\n");
6483 #endif
6484                 }
6485 
6486                 return (zone_create(zs.zone_name, zs.zone_root,
6487                     zs.zone_privs, zs.zone_privssz,
6488                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6489                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6490                     zs.extended_error, zs.match, zs.doi,
6491                     zs.label, zs.flags));
6492         case ZONE_BOOT:
6493                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6494         case ZONE_DESTROY:
6495                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6496         case ZONE_GETATTR:
6497                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6498                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6499         case ZONE_SETATTR:
6500                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6501                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6502         case ZONE_ENTER:
6503                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6504         case ZONE_LIST:
6505                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6506         case ZONE_SHUTDOWN:
6507                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6508         case ZONE_LOOKUP:
6509                 return (zone_lookup((const char *)arg1));
6510         case ZONE_VERSION:
6511                 return (zone_version((int *)arg1));
6512         case ZONE_ADD_DATALINK:
6513                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6514                     (datalink_id_t)(uintptr_t)arg2));
6515         case ZONE_DEL_DATALINK:
6516                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6517                     (datalink_id_t)(uintptr_t)arg2));
6518         case ZONE_CHECK_DATALINK: {
6519                 zoneid_t        zoneid;
6520                 boolean_t       need_copyout;
6521 
6522                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6523                         return (EFAULT);
6524                 need_copyout = (zoneid == ALL_ZONES);
6525                 err = zone_check_datalink(&zoneid,
6526                     (datalink_id_t)(uintptr_t)arg2);
6527                 if (err == 0 && need_copyout) {
6528                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6529                                 err = EFAULT;
6530                 }
6531                 return (err == 0 ? 0 : set_errno(err));
6532         }
6533         case ZONE_LIST_DATALINK:
6534                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6535                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6536         default:
6537                 return (set_errno(EINVAL));
6538         }
6539 }
6540 
6541 struct zarg {
6542         zone_t *zone;
6543         zone_cmd_arg_t arg;
6544 };
6545 
6546 static int
6547 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6548 {
6549         char *buf;
6550         size_t buflen;
6551         int error;
6552 
6553         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6554         buf = kmem_alloc(buflen, KM_SLEEP);
6555         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6556         error = door_ki_open(buf, doorp);
6557         kmem_free(buf, buflen);
6558         return (error);
6559 }
6560 
6561 static void
6562 zone_release_door(door_handle_t *doorp)
6563 {
6564         door_ki_rele(*doorp);
6565         *doorp = NULL;
6566 }
6567 
6568 static void
6569 zone_ki_call_zoneadmd(struct zarg *zargp)
6570 {
6571         door_handle_t door = NULL;
6572         door_arg_t darg, save_arg;
6573         char *zone_name;
6574         size_t zone_namelen;
6575         zoneid_t zoneid;
6576         zone_t *zone;
6577         zone_cmd_arg_t arg;
6578         uint64_t uniqid;
6579         size_t size;
6580         int error;
6581         int retry;
6582 
6583         zone = zargp->zone;
6584         arg = zargp->arg;
6585         kmem_free(zargp, sizeof (*zargp));
6586 
6587         zone_namelen = strlen(zone->zone_name) + 1;
6588         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6589         bcopy(zone->zone_name, zone_name, zone_namelen);
6590         zoneid = zone->zone_id;
6591         uniqid = zone->zone_uniqid;
6592         /*
6593          * zoneadmd may be down, but at least we can empty out the zone.
6594          * We can ignore the return value of zone_empty() since we're called
6595          * from a kernel thread and know we won't be delivered any signals.
6596          */
6597         ASSERT(curproc == &p0);
6598         (void) zone_empty(zone);
6599         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6600         zone_rele(zone);
6601 
6602         size = sizeof (arg);
6603         darg.rbuf = (char *)&arg;
6604         darg.data_ptr = (char *)&arg;
6605         darg.rsize = size;
6606         darg.data_size = size;
6607         darg.desc_ptr = NULL;
6608         darg.desc_num = 0;
6609 
6610         save_arg = darg;
6611         /*
6612          * Since we're not holding a reference to the zone, any number of
6613          * things can go wrong, including the zone disappearing before we get a
6614          * chance to talk to zoneadmd.
6615          */
6616         for (retry = 0; /* forever */; retry++) {
6617                 if (door == NULL &&
6618                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6619                         goto next;
6620                 }
6621                 ASSERT(door != NULL);
6622 
6623                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6624                     SIZE_MAX, 0)) == 0) {
6625                         break;
6626                 }
6627                 switch (error) {
6628                 case EINTR:
6629                         /* FALLTHROUGH */
6630                 case EAGAIN:    /* process may be forking */
6631                         /*
6632                          * Back off for a bit
6633                          */
6634                         break;
6635                 case EBADF:
6636                         zone_release_door(&door);
6637                         if (zone_lookup_door(zone_name, &door) != 0) {
6638                                 /*
6639                                  * zoneadmd may be dead, but it may come back to
6640                                  * life later.
6641                                  */
6642                                 break;
6643                         }
6644                         break;
6645                 default:
6646                         cmn_err(CE_WARN,
6647                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6648                             error);
6649                         goto out;
6650                 }
6651 next:
6652                 /*
6653                  * If this isn't the same zone_t that we originally had in mind,
6654                  * then this is the same as if two kadmin requests come in at
6655                  * the same time: the first one wins.  This means we lose, so we
6656                  * bail.
6657                  */
6658                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
6659                         /*
6660                          * Problem is solved.
6661                          */
6662                         break;
6663                 }
6664                 if (zone->zone_uniqid != uniqid) {
6665                         /*
6666                          * zoneid recycled
6667                          */
6668                         zone_rele(zone);
6669                         break;
6670                 }
6671                 /*
6672                  * We could zone_status_timedwait(), but there doesn't seem to
6673                  * be much point in doing that (plus, it would mean that
6674                  * zone_free() isn't called until this thread exits).
6675                  */
6676                 zone_rele(zone);
6677                 delay(hz);
6678                 darg = save_arg;
6679         }
6680 out:
6681         if (door != NULL) {
6682                 zone_release_door(&door);
6683         }
6684         kmem_free(zone_name, zone_namelen);
6685         thread_exit();
6686 }
6687 
6688 /*
6689  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6690  * kadmin().  The caller is a process in the zone.
6691  *
6692  * In order to shutdown the zone, we will hand off control to zoneadmd
6693  * (running in the global zone) via a door.  We do a half-hearted job at
6694  * killing all processes in the zone, create a kernel thread to contact
6695  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6696  * a form of generation number used to let zoneadmd (as well as
6697  * zone_destroy()) know exactly which zone they're re talking about.
6698  */
6699 int
6700 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6701 {
6702         struct zarg *zargp;
6703         zone_cmd_t zcmd;
6704         zone_t *zone;
6705 
6706         zone = curproc->p_zone;
6707         ASSERT(getzoneid() != GLOBAL_ZONEID);
6708 
6709         switch (cmd) {
6710         case A_SHUTDOWN:
6711                 switch (fcn) {
6712                 case AD_HALT:
6713                 case AD_POWEROFF:
6714                         zcmd = Z_HALT;
6715                         break;
6716                 case AD_BOOT:
6717                         zcmd = Z_REBOOT;
6718                         break;
6719                 case AD_IBOOT:
6720                 case AD_SBOOT:
6721                 case AD_SIBOOT:
6722                 case AD_NOSYNC:
6723                         return (ENOTSUP);
6724                 default:
6725                         return (EINVAL);
6726                 }
6727                 break;
6728         case A_REBOOT:
6729                 zcmd = Z_REBOOT;
6730                 break;
6731         case A_FTRACE:
6732         case A_REMOUNT:
6733         case A_FREEZE:
6734         case A_DUMP:
6735         case A_CONFIG:
6736                 return (ENOTSUP);
6737         default:
6738                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6739                 return (EINVAL);
6740         }
6741 
6742         if (secpolicy_zone_admin(credp, B_FALSE))
6743                 return (EPERM);
6744         mutex_enter(&zone_status_lock);
6745 
6746         /*
6747          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6748          * is in the zone.
6749          */
6750         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6751         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6752                 /*
6753                  * This zone is already on its way down.
6754                  */
6755                 mutex_exit(&zone_status_lock);
6756                 return (0);
6757         }
6758         /*
6759          * Prevent future zone_enter()s
6760          */
6761         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6762         mutex_exit(&zone_status_lock);
6763 
6764         /*
6765          * Kill everyone now and call zoneadmd later.
6766          * zone_ki_call_zoneadmd() will do a more thorough job of this
6767          * later.
6768          */
6769         killall(zone->zone_id);
6770         /*
6771          * Now, create the thread to contact zoneadmd and do the rest of the
6772          * work.  This thread can't be created in our zone otherwise
6773          * zone_destroy() would deadlock.
6774          */
6775         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6776         zargp->arg.cmd = zcmd;
6777         zargp->arg.uniqid = zone->zone_uniqid;
6778         zargp->zone = zone;
6779         (void) strcpy(zargp->arg.locale, "C");
6780         /* mdep was already copied in for us by uadmin */
6781         if (mdep != NULL)
6782                 (void) strlcpy(zargp->arg.bootbuf, mdep,
6783                     sizeof (zargp->arg.bootbuf));
6784         zone_hold(zone);
6785 
6786         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6787             TS_RUN, minclsyspri);
6788         exit(CLD_EXITED, 0);
6789 
6790         return (EINVAL);
6791 }
6792 
6793 /*
6794  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6795  * status to ZONE_IS_SHUTTING_DOWN.
6796  *
6797  * This function also shuts down all running zones to ensure that they won't
6798  * fork new processes.
6799  */
6800 void
6801 zone_shutdown_global(void)
6802 {
6803         zone_t *current_zonep;
6804 
6805         ASSERT(INGLOBALZONE(curproc));
6806         mutex_enter(&zonehash_lock);
6807         mutex_enter(&zone_status_lock);
6808 
6809         /* Modify the global zone's status first. */
6810         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6811         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6812 
6813         /*
6814          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6815          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6816          * could cause assertions to fail (e.g., assertions about a zone's
6817          * state during initialization, readying, or booting) or produce races.
6818          * We'll let threads continue to initialize and ready new zones: they'll
6819          * fail to boot the new zones when they see that the global zone is
6820          * shutting down.
6821          */
6822         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6823             current_zonep = list_next(&zone_active, current_zonep)) {
6824                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6825                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6826         }
6827         mutex_exit(&zone_status_lock);
6828         mutex_exit(&zonehash_lock);
6829 }
6830 
6831 /*
6832  * Returns true if the named dataset is visible in the current zone.
6833  * The 'write' parameter is set to 1 if the dataset is also writable.
6834  */
6835 int
6836 zone_dataset_visible(const char *dataset, int *write)
6837 {
6838         static int zfstype = -1;
6839         zone_dataset_t *zd;
6840         size_t len;
6841         zone_t *zone = curproc->p_zone;
6842         const char *name = NULL;
6843         vfs_t *vfsp = NULL;
6844 
6845         if (dataset[0] == '\0')
6846                 return (0);
6847 
6848         /*
6849          * Walk the list once, looking for datasets which match exactly, or
6850          * specify a dataset underneath an exported dataset.  If found, return
6851          * true and note that it is writable.
6852          */
6853         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6854             zd = list_next(&zone->zone_datasets, zd)) {
6855 
6856                 len = strlen(zd->zd_dataset);
6857                 if (strlen(dataset) >= len &&
6858                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6859                     (dataset[len] == '\0' || dataset[len] == '/' ||
6860                     dataset[len] == '@')) {
6861                         if (write)
6862                                 *write = 1;
6863                         return (1);
6864                 }
6865         }
6866 
6867         /*
6868          * Walk the list a second time, searching for datasets which are parents
6869          * of exported datasets.  These should be visible, but read-only.
6870          *
6871          * Note that we also have to support forms such as 'pool/dataset/', with
6872          * a trailing slash.
6873          */
6874         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6875             zd = list_next(&zone->zone_datasets, zd)) {
6876 
6877                 len = strlen(dataset);
6878                 if (dataset[len - 1] == '/')
6879                         len--;  /* Ignore trailing slash */
6880                 if (len < strlen(zd->zd_dataset) &&
6881                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6882                     zd->zd_dataset[len] == '/') {
6883                         if (write)
6884                                 *write = 0;
6885                         return (1);
6886                 }
6887         }
6888 
6889         /*
6890          * We reach here if the given dataset is not found in the zone_dataset
6891          * list. Check if this dataset was added as a filesystem (ie. "add fs")
6892          * instead of delegation. For this we search for the dataset in the
6893          * zone_vfslist of this zone. If found, return true and note that it is
6894          * not writable.
6895          */
6896 
6897         /*
6898          * Initialize zfstype if it is not initialized yet.
6899          */
6900         if (zfstype == -1) {
6901                 struct vfssw *vswp = vfs_getvfssw("zfs");
6902                 zfstype = vswp - vfssw;
6903                 vfs_unrefvfssw(vswp);
6904         }
6905 
6906         vfs_list_read_lock();
6907         vfsp = zone->zone_vfslist;
6908         do {
6909                 ASSERT(vfsp);
6910                 if (vfsp->vfs_fstype == zfstype) {
6911                         name = refstr_value(vfsp->vfs_resource);
6912 
6913                         /*
6914                          * Check if we have an exact match.
6915                          */
6916                         if (strcmp(dataset, name) == 0) {
6917                                 vfs_list_unlock();
6918                                 if (write)
6919                                         *write = 0;
6920                                 return (1);
6921                         }
6922                         /*
6923                          * We need to check if we are looking for parents of
6924                          * a dataset. These should be visible, but read-only.
6925                          */
6926                         len = strlen(dataset);
6927                         if (dataset[len - 1] == '/')
6928                                 len--;
6929 
6930                         if (len < strlen(name) &&
6931                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6932                                 vfs_list_unlock();
6933                                 if (write)
6934                                         *write = 0;
6935                                 return (1);
6936                         }
6937                 }
6938                 vfsp = vfsp->vfs_zone_next;
6939         } while (vfsp != zone->zone_vfslist);
6940 
6941         vfs_list_unlock();
6942         return (0);
6943 }
6944 
6945 /*
6946  * zone_find_by_any_path() -
6947  *
6948  * kernel-private routine similar to zone_find_by_path(), but which
6949  * effectively compares against zone paths rather than zonerootpath
6950  * (i.e., the last component of zonerootpaths, which should be "root/",
6951  * are not compared.)  This is done in order to accurately identify all
6952  * paths, whether zone-visible or not, including those which are parallel
6953  * to /root/, such as /dev/, /home/, etc...
6954  *
6955  * If the specified path does not fall under any zone path then global
6956  * zone is returned.
6957  *
6958  * The treat_abs parameter indicates whether the path should be treated as
6959  * an absolute path although it does not begin with "/".  (This supports
6960  * nfs mount syntax such as host:any/path.)
6961  *
6962  * The caller is responsible for zone_rele of the returned zone.
6963  */
6964 zone_t *
6965 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6966 {
6967         zone_t *zone;
6968         int path_offset = 0;
6969 
6970         if (path == NULL) {
6971                 zone_hold(global_zone);
6972                 return (global_zone);
6973         }
6974 
6975         if (*path != '/') {
6976                 ASSERT(treat_abs);
6977                 path_offset = 1;
6978         }
6979 
6980         mutex_enter(&zonehash_lock);
6981         for (zone = list_head(&zone_active); zone != NULL;
6982             zone = list_next(&zone_active, zone)) {
6983                 char    *c;
6984                 size_t  pathlen;
6985                 char *rootpath_start;
6986 
6987                 if (zone == global_zone)        /* skip global zone */
6988                         continue;
6989 
6990                 /* scan backwards to find start of last component */
6991                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6992                 do {
6993                         c--;
6994                 } while (*c != '/');
6995 
6996                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
6997                 rootpath_start = (zone->zone_rootpath + path_offset);
6998                 if (strncmp(path, rootpath_start, pathlen) == 0)
6999                         break;
7000         }
7001         if (zone == NULL)
7002                 zone = global_zone;
7003         zone_hold(zone);
7004         mutex_exit(&zonehash_lock);
7005         return (zone);
7006 }
7007 
7008 /*
7009  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7010  * zone_dl_t pointer if found, and NULL otherwise.
7011  */
7012 static zone_dl_t *
7013 zone_find_dl(zone_t *zone, datalink_id_t linkid)
7014 {
7015         zone_dl_t *zdl;
7016 
7017         ASSERT(mutex_owned(&zone->zone_lock));
7018         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7019             zdl = list_next(&zone->zone_dl_list, zdl)) {
7020                 if (zdl->zdl_id == linkid)
7021                         break;
7022         }
7023         return (zdl);
7024 }
7025 
7026 static boolean_t
7027 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7028 {
7029         boolean_t exists;
7030 
7031         mutex_enter(&zone->zone_lock);
7032         exists = (zone_find_dl(zone, linkid) != NULL);
7033         mutex_exit(&zone->zone_lock);
7034         return (exists);
7035 }
7036 
7037 /*
7038  * Add an data link name for the zone.
7039  */
7040 static int
7041 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7042 {
7043         zone_dl_t *zdl;
7044         zone_t *zone;
7045         zone_t *thiszone;
7046 
7047         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7048                 return (set_errno(ENXIO));
7049 
7050         /* Verify that the datalink ID doesn't already belong to a zone. */
7051         mutex_enter(&zonehash_lock);
7052         for (zone = list_head(&zone_active); zone != NULL;
7053             zone = list_next(&zone_active, zone)) {
7054                 if (zone_dl_exists(zone, linkid)) {
7055                         mutex_exit(&zonehash_lock);
7056                         zone_rele(thiszone);
7057                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7058                 }
7059         }
7060 
7061         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7062         zdl->zdl_id = linkid;
7063         zdl->zdl_net = NULL;
7064         mutex_enter(&thiszone->zone_lock);
7065         list_insert_head(&thiszone->zone_dl_list, zdl);
7066         mutex_exit(&thiszone->zone_lock);
7067         mutex_exit(&zonehash_lock);
7068         zone_rele(thiszone);
7069         return (0);
7070 }
7071 
7072 static int
7073 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7074 {
7075         zone_dl_t *zdl;
7076         zone_t *zone;
7077         int err = 0;
7078 
7079         if ((zone = zone_find_by_id(zoneid)) == NULL)
7080                 return (set_errno(EINVAL));
7081 
7082         mutex_enter(&zone->zone_lock);
7083         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7084                 err = ENXIO;
7085         } else {
7086                 list_remove(&zone->zone_dl_list, zdl);
7087                 nvlist_free(zdl->zdl_net);
7088                 kmem_free(zdl, sizeof (zone_dl_t));
7089         }
7090         mutex_exit(&zone->zone_lock);
7091         zone_rele(zone);
7092         return (err == 0 ? 0 : set_errno(err));
7093 }
7094 
7095 /*
7096  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
7097  * the linkid.  Otherwise we just check if the specified zoneidp has been
7098  * assigned the supplied linkid.
7099  */
7100 int
7101 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7102 {
7103         zone_t *zone;
7104         int err = ENXIO;
7105 
7106         if (*zoneidp != ALL_ZONES) {
7107                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7108                         if (zone_dl_exists(zone, linkid))
7109                                 err = 0;
7110                         zone_rele(zone);
7111                 }
7112                 return (err);
7113         }
7114 
7115         mutex_enter(&zonehash_lock);
7116         for (zone = list_head(&zone_active); zone != NULL;
7117             zone = list_next(&zone_active, zone)) {
7118                 if (zone_dl_exists(zone, linkid)) {
7119                         *zoneidp = zone->zone_id;
7120                         err = 0;
7121                         break;
7122                 }
7123         }
7124         mutex_exit(&zonehash_lock);
7125         return (err);
7126 }
7127 
7128 /*
7129  * Get the list of datalink IDs assigned to a zone.
7130  *
7131  * On input, *nump is the number of datalink IDs that can fit in the supplied
7132  * idarray.  Upon return, *nump is either set to the number of datalink IDs
7133  * that were placed in the array if the array was large enough, or to the
7134  * number of datalink IDs that the function needs to place in the array if the
7135  * array is too small.
7136  */
7137 static int
7138 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7139 {
7140         uint_t num, dlcount;
7141         zone_t *zone;
7142         zone_dl_t *zdl;
7143         datalink_id_t *idptr = idarray;
7144 
7145         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7146                 return (set_errno(EFAULT));
7147         if ((zone = zone_find_by_id(zoneid)) == NULL)
7148                 return (set_errno(ENXIO));
7149 
7150         num = 0;
7151         mutex_enter(&zone->zone_lock);
7152         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7153             zdl = list_next(&zone->zone_dl_list, zdl)) {
7154                 /*
7155                  * If the list is bigger than what the caller supplied, just
7156                  * count, don't do copyout.
7157                  */
7158                 if (++num > dlcount)
7159                         continue;
7160                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7161                         mutex_exit(&zone->zone_lock);
7162                         zone_rele(zone);
7163                         return (set_errno(EFAULT));
7164                 }
7165                 idptr++;
7166         }
7167         mutex_exit(&zone->zone_lock);
7168         zone_rele(zone);
7169 
7170         /* Increased or decreased, caller should be notified. */
7171         if (num != dlcount) {
7172                 if (copyout(&num, nump, sizeof (num)) != 0)
7173                         return (set_errno(EFAULT));
7174         }
7175         return (0);
7176 }
7177 
7178 /*
7179  * Public interface for looking up a zone by zoneid. It's a customized version
7180  * for netstack_zone_create(). It can only be called from the zsd create
7181  * callbacks, since it doesn't have reference on the zone structure hence if
7182  * it is called elsewhere the zone could disappear after the zonehash_lock
7183  * is dropped.
7184  *
7185  * Furthermore it
7186  * 1. Doesn't check the status of the zone.
7187  * 2. It will be called even before zone_init is called, in that case the
7188  *    address of zone0 is returned directly, and netstack_zone_create()
7189  *    will only assign a value to zone0.zone_netstack, won't break anything.
7190  * 3. Returns without the zone being held.
7191  */
7192 zone_t *
7193 zone_find_by_id_nolock(zoneid_t zoneid)
7194 {
7195         zone_t *zone;
7196 
7197         mutex_enter(&zonehash_lock);
7198         if (zonehashbyid == NULL)
7199                 zone = &zone0;
7200         else
7201                 zone = zone_find_all_by_id(zoneid);
7202         mutex_exit(&zonehash_lock);
7203         return (zone);
7204 }
7205 
7206 /*
7207  * Walk the datalinks for a given zone
7208  */
7209 int
7210 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7211     void *data)
7212 {
7213         zone_t          *zone;
7214         zone_dl_t       *zdl;
7215         datalink_id_t   *idarray;
7216         uint_t          idcount = 0;
7217         int             i, ret = 0;
7218 
7219         if ((zone = zone_find_by_id(zoneid)) == NULL)
7220                 return (ENOENT);
7221 
7222         /*
7223          * We first build an array of linkid's so that we can walk these and
7224          * execute the callback with the zone_lock dropped.
7225          */
7226         mutex_enter(&zone->zone_lock);
7227         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7228             zdl = list_next(&zone->zone_dl_list, zdl)) {
7229                 idcount++;
7230         }
7231 
7232         if (idcount == 0) {
7233                 mutex_exit(&zone->zone_lock);
7234                 zone_rele(zone);
7235                 return (0);
7236         }
7237 
7238         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7239         if (idarray == NULL) {
7240                 mutex_exit(&zone->zone_lock);
7241                 zone_rele(zone);
7242                 return (ENOMEM);
7243         }
7244 
7245         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7246             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7247                 idarray[i] = zdl->zdl_id;
7248         }
7249 
7250         mutex_exit(&zone->zone_lock);
7251 
7252         for (i = 0; i < idcount && ret == 0; i++) {
7253                 if ((ret = (*cb)(idarray[i], data)) != 0)
7254                         break;
7255         }
7256 
7257         zone_rele(zone);
7258         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7259         return (ret);
7260 }
7261 
7262 static char *
7263 zone_net_type2name(int type)
7264 {
7265         switch (type) {
7266         case ZONE_NETWORK_ADDRESS:
7267                 return (ZONE_NET_ADDRNAME);
7268         case ZONE_NETWORK_DEFROUTER:
7269                 return (ZONE_NET_RTRNAME);
7270         default:
7271                 return (NULL);
7272         }
7273 }
7274 
7275 static int
7276 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7277 {
7278         zone_t *zone;
7279         zone_dl_t *zdl;
7280         nvlist_t *nvl;
7281         int err = 0;
7282         uint8_t *new = NULL;
7283         char *nvname;
7284         int bufsize;
7285         datalink_id_t linkid = znbuf->zn_linkid;
7286 
7287         if (secpolicy_zone_config(CRED()) != 0)
7288                 return (set_errno(EPERM));
7289 
7290         if (zoneid == GLOBAL_ZONEID)
7291                 return (set_errno(EINVAL));
7292 
7293         nvname = zone_net_type2name(znbuf->zn_type);
7294         bufsize = znbuf->zn_len;
7295         new = znbuf->zn_val;
7296         if (nvname == NULL)
7297                 return (set_errno(EINVAL));
7298 
7299         if ((zone = zone_find_by_id(zoneid)) == NULL) {
7300                 return (set_errno(EINVAL));
7301         }
7302 
7303         mutex_enter(&zone->zone_lock);
7304         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7305                 err = ENXIO;
7306                 goto done;
7307         }
7308         if ((nvl = zdl->zdl_net) == NULL) {
7309                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7310                         err = ENOMEM;
7311                         goto done;
7312                 } else {
7313                         zdl->zdl_net = nvl;
7314                 }
7315         }
7316         if (nvlist_exists(nvl, nvname)) {
7317                 err = EINVAL;
7318                 goto done;
7319         }
7320         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7321         ASSERT(err == 0);
7322 done:
7323         mutex_exit(&zone->zone_lock);
7324         zone_rele(zone);
7325         if (err != 0)
7326                 return (set_errno(err));
7327         else
7328                 return (0);
7329 }
7330 
7331 static int
7332 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7333 {
7334         zone_t *zone;
7335         zone_dl_t *zdl;
7336         nvlist_t *nvl;
7337         uint8_t *ptr;
7338         uint_t psize;
7339         int err = 0;
7340         char *nvname;
7341         int bufsize;
7342         void *buf;
7343         datalink_id_t linkid = znbuf->zn_linkid;
7344 
7345         if (zoneid == GLOBAL_ZONEID)
7346                 return (set_errno(EINVAL));
7347 
7348         nvname = zone_net_type2name(znbuf->zn_type);
7349         bufsize = znbuf->zn_len;
7350         buf = znbuf->zn_val;
7351 
7352         if (nvname == NULL)
7353                 return (set_errno(EINVAL));
7354         if ((zone = zone_find_by_id(zoneid)) == NULL)
7355                 return (set_errno(EINVAL));
7356 
7357         mutex_enter(&zone->zone_lock);
7358         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7359                 err = ENXIO;
7360                 goto done;
7361         }
7362         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7363                 err = ENOENT;
7364                 goto done;
7365         }
7366         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7367         ASSERT(err == 0);
7368 
7369         if (psize > bufsize) {
7370                 err = ENOBUFS;
7371                 goto done;
7372         }
7373         znbuf->zn_len = psize;
7374         bcopy(ptr, buf, psize);
7375 done:
7376         mutex_exit(&zone->zone_lock);
7377         zone_rele(zone);
7378         if (err != 0)
7379                 return (set_errno(err));
7380         else
7381                 return (0);
7382 }