2 Old usr/src/uts/common/os/zone.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent Inc. All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
  27  */
  28 
  29 /*
  30  * Zones
  31  *
  32  *   A zone is a named collection of processes, namespace constraints,
  33  *   and other system resources which comprise a secure and manageable
  34  *   application containment facility.
  35  *
  36  *   Zones (represented by the reference counted zone_t) are tracked in
  37  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  38  *   (zoneid_t) are used to track zone association.  Zone IDs are
  39  *   dynamically generated when the zone is created; if a persistent
  40  *   identifier is needed (core files, accounting logs, audit trail,
  41  *   etc.), the zone name should be used.
  42  *
  43  *
  44  *   Global Zone:
  45  *
  46  *   The global zone (zoneid 0) is automatically associated with all
  47  *   system resources that have not been bound to a user-created zone.
  48  *   This means that even systems where zones are not in active use
  49  *   have a global zone, and all processes, mounts, etc. are
  50  *   associated with that zone.  The global zone is generally
  51  *   unconstrained in terms of privileges and access, though the usual
  52  *   credential and privilege based restrictions apply.
  53  *
  54  *
  55  *   Zone States:
  56  *
  57  *   The states a zone may be in, and the transitions between them, are
  58  *   as follows:
  59  *
  60  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  61  *   initialized zone is added to the list of active zones on the system but
  62  *   isn't accessible.
  63  *
  64  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  65  *   not yet completed. Not possible to enter the zone, but attributes can
  66  *   be retrieved.
  67  *
  68  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  69  *   ready.  The zone is made visible after the ZSD constructor callbacks are
  70  *   executed.  A zone remains in this state until it transitions into
  71  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  72  *
  73  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  74  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  75  *   state.
  76  *
  77  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  78  *   successfully started init.   A zone remains in this state until
  79  *   zone_shutdown() is called.
  80  *
  81  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  82  *   killing all processes running in the zone. The zone remains
  83  *   in this state until there are no more user processes running in the zone.
  84  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  85  *   Since zone_shutdown() is restartable, it may be called successfully
  86  *   multiple times for the same zone_t.  Setting of the zone's state to
  87  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  88  *   the zone's status without worrying about it being a moving target.
  89  *
  90  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  91  *   are no more user processes in the zone.  The zone remains in this
  92  *   state until there are no more kernel threads associated with the
  93  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  94  *   fail.
  95  *
  96  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  97  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  98  *   join the zone or create kernel threads therein.
  99  *
 100  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 101  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 102  *   return NULL from now on.
 103  *
 104  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 105  *   processes or threads doing work on behalf of the zone.  The zone is
 106  *   removed from the list of active zones.  zone_destroy() returns, and
 107  *   the zone can be recreated.
 108  *
 109  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 110  *   callbacks are executed, and all memory associated with the zone is
 111  *   freed.
 112  *
 113  *   Threads can wait for the zone to enter a requested state by using
 114  *   zone_status_wait() or zone_status_timedwait() with the desired
 115  *   state passed in as an argument.  Zone state transitions are
 116  *   uni-directional; it is not possible to move back to an earlier state.
 117  *
 118  *
 119  *   Zone-Specific Data:
 120  *
 121  *   Subsystems needing to maintain zone-specific data can store that
 122  *   data using the ZSD mechanism.  This provides a zone-specific data
 123  *   store, similar to thread-specific data (see pthread_getspecific(3C)
 124  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 125  *   to register callbacks to be invoked when a zone is created, shut
 126  *   down, or destroyed.  This can be used to initialize zone-specific
 127  *   data for new zones and to clean up when zones go away.
 128  *
 129  *
 130  *   Data Structures:
 131  *
 132  *   The per-zone structure (zone_t) is reference counted, and freed
 133  *   when all references are released.  zone_hold and zone_rele can be
 134  *   used to adjust the reference count.  In addition, reference counts
 135  *   associated with the cred_t structure are tracked separately using
 136  *   zone_cred_hold and zone_cred_rele.
 137  *
 138  *   Pointers to active zone_t's are stored in two hash tables; one
 139  *   for searching by id, the other for searching by name.  Lookups
 140  *   can be performed on either basis, using zone_find_by_id and
 141  *   zone_find_by_name.  Both return zone_t pointers with the zone
 142  *   held, so zone_rele should be called when the pointer is no longer
 143  *   needed.  Zones can also be searched by path; zone_find_by_path
 144  *   returns the zone with which a path name is associated (global
 145  *   zone if the path is not within some other zone's file system
 146  *   hierarchy).  This currently requires iterating through each zone,
 147  *   so it is slower than an id or name search via a hash table.
 148  *
 149  *
 150  *   Locking:
 151  *
 152  *   zonehash_lock: This is a top-level global lock used to protect the
 153  *       zone hash tables and lists.  Zones cannot be created or destroyed
 154  *       while this lock is held.
 155  *   zone_status_lock: This is a global lock protecting zone state.
 156  *       Zones cannot change state while this lock is held.  It also
 157  *       protects the list of kernel threads associated with a zone.
 158  *   zone_lock: This is a per-zone lock used to protect several fields of
 159  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 160  *       this lock means that the zone cannot go away.
 161  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 162  *       related to the zone.max-lwps rctl.
 163  *   zone_mem_lock: This is a per-zone lock used to protect the fields
 164  *       related to the zone.max-locked-memory and zone.max-swap rctls.
 165  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 166  *       currently just max_lofi
 167  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 168  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 169  *       list (a list of zones in the ZONE_IS_DEAD state).
 170  *
 171  *   Ordering requirements:
 172  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 173  *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 174  *
 175  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 176  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 177  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 178  *
 179  *   Blocking memory allocations are permitted while holding any of the
 180  *   zone locks.
 181  *
 182  *
 183  *   System Call Interface:
 184  *
 185  *   The zone subsystem can be managed and queried from user level with
 186  *   the following system calls (all subcodes of the primary "zone"
 187  *   system call):
 188  *   - zone_create: creates a zone with selected attributes (name,
 189  *     root path, privileges, resource controls, ZFS datasets)
 190  *   - zone_enter: allows the current process to enter a zone
 191  *   - zone_getattr: reports attributes of a zone
 192  *   - zone_setattr: set attributes of a zone
 193  *   - zone_boot: set 'init' running for the zone
 194  *   - zone_list: lists all zones active in the system
 195  *   - zone_lookup: looks up zone id based on name
 196  *   - zone_shutdown: initiates shutdown process (see states above)
 197  *   - zone_destroy: completes shutdown process (see states above)
 198  *
 199  */
 200 
 201 #include <sys/priv_impl.h>
 202 #include <sys/cred.h>
 203 #include <c2/audit.h>
 204 #include <sys/debug.h>
 205 #include <sys/file.h>
 206 #include <sys/kmem.h>
 207 #include <sys/kstat.h>
 208 #include <sys/mutex.h>
 209 #include <sys/note.h>
 210 #include <sys/pathname.h>
 211 #include <sys/proc.h>
 212 #include <sys/project.h>
 213 #include <sys/sysevent.h>
 214 #include <sys/task.h>
 215 #include <sys/systm.h>
 216 #include <sys/types.h>
 217 #include <sys/utsname.h>
 218 #include <sys/vnode.h>
 219 #include <sys/vfs.h>
 220 #include <sys/systeminfo.h>
 221 #include <sys/policy.h>
 222 #include <sys/cred_impl.h>
 223 #include <sys/contract_impl.h>
 224 #include <sys/contract/process_impl.h>
 225 #include <sys/class.h>
 226 #include <sys/pool.h>
 227 #include <sys/pool_pset.h>
 228 #include <sys/pset.h>
 229 #include <sys/strlog.h>
 230 #include <sys/sysmacros.h>
 231 #include <sys/callb.h>
 232 #include <sys/vmparam.h>
 233 #include <sys/corectl.h>
 234 #include <sys/ipc_impl.h>
 235 #include <sys/klpd.h>
 236 
 237 #include <sys/door.h>
 238 #include <sys/cpuvar.h>
 239 #include <sys/sdt.h>
 240 
 241 #include <sys/uadmin.h>
 242 #include <sys/session.h>
 243 #include <sys/cmn_err.h>
 244 #include <sys/modhash.h>
 245 #include <sys/sunddi.h>
 246 #include <sys/nvpair.h>
 247 #include <sys/rctl.h>
 248 #include <sys/fss.h>
 249 #include <sys/brand.h>
 250 #include <sys/zone.h>
 251 #include <net/if.h>
 252 #include <sys/cpucaps.h>
 253 #include <vm/seg.h>
 254 #include <sys/mac.h>
 255 
 256 /*
 257  * This constant specifies the number of seconds that threads waiting for
 258  * subsystems to release a zone's general-purpose references will wait before
 259  * they log the zone's reference counts.  The constant's value shouldn't
 260  * be so small that reference counts are unnecessarily reported for zones
 261  * whose references are slowly released.  On the other hand, it shouldn't be so
 262  * large that users reboot their systems out of frustration over hung zones
 263  * before the system logs the zones' reference counts.
 264  */
 265 #define ZONE_DESTROY_TIMEOUT_SECS       60
 266 
  267 /* One entry in the list of data link IDs which are accessible from the zone */
  268 typedef struct zone_dl {
  269         datalink_id_t   zdl_id;         /* ID of the data link */
  270         nvlist_t        *zdl_net;       /* NOTE(review): presumably per-link network data (cf. zone_set_network()) - confirm */
  271         list_node_t     zdl_linkage;    /* node linking this entry into its list */
  272 } zone_dl_t;
 273 
 274 /*
 275  * cv used to signal that all references to the zone have been released.  This
 276  * needs to be global since there may be multiple waiters, and the first to
 277  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 278  */
 279 static kcondvar_t zone_destroy_cv;
 280 /*
 281  * Lock used to serialize access to zone_cv.  This could have been per-zone,
 282  * but then we'd need another lock for zone_destroy_cv, and why bother?
 283  */
 284 static kmutex_t zone_status_lock;
 285 
 286 /*
 287  * ZSD-related global variables.
 288  */
 289 static kmutex_t zsd_key_lock;   /* protects the following two */
 290 /*
 291  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 292  */
 293 static zone_key_t zsd_keyval = 0;
 294 /*
 295  * Global list of registered keys.  We use this when a new zone is created.
 296  */
 297 static list_t zsd_registered_keys;
 298 
  299 int zone_hash_size = 256;       /* NOTE(review): presumably the size used for the zone hash tables - confirm */
  300 static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
  301 static kmutex_t zonehash_lock;  /* protects the zone hash tables and lists */
  302 static uint_t zonecount;        /* NOTE(review): presumably the number of zones currently tracked - confirm */
  303 static id_space_t *zoneid_space;        /* NOTE(review): presumably the ID space zone IDs are drawn from - confirm */
 304 
 305 /*
 306  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 307  * kernel proper runs, and which manages all other zones.
 308  *
 309  * Although not declared as static, the variable "zone0" should not be used
 310  * except for by code that needs to reference the global zone early on in boot,
 311  * before it is fully initialized.  All other consumers should use
 312  * 'global_zone'.
 313  */
 314 zone_t zone0;
 315 zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 316 
 317 /*
 318  * List of active zones, protected by zonehash_lock.
 319  */
 320 static list_t zone_active;
 321 
 322 /*
 323  * List of destroyed zones that still have outstanding cred references.
 324  * Used for debugging.  Uses a separate lock to avoid lock ordering
 325  * problems in zone_free.
 326  */
 327 static list_t zone_deathrow;
 328 static kmutex_t zone_deathrow_lock;
 329 
 330 /* number of zones is limited by virtual interface limit in IP */
 331 uint_t maxzones = 8192;
 332 
 333 /* Event channel used to send zone state change notifications */
 334 evchan_t *zone_event_chan;
 335 
 336 /*
 337  * This table holds the mapping from kernel zone states to
 338  * states visible in the state notification API.
 339  * The idea is that we only expose "obvious" states and
 340  * do not expose states which are just implementation details.
 341  */
 342 const char  *zone_status_table[] = {
 343         ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 344         ZONE_EVENT_INITIALIZED,         /* initialized */
 345         ZONE_EVENT_READY,               /* ready */
 346         ZONE_EVENT_READY,               /* booting */
 347         ZONE_EVENT_RUNNING,             /* running */
 348         ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 349         ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 350         ZONE_EVENT_SHUTTING_DOWN,       /* down */
 351         ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 352         ZONE_EVENT_UNINITIALIZED,       /* dead */
 353 };
 354 
 355 /*
 356  * This array contains the names of the subsystems listed in zone_ref_subsys_t
 357  * (see sys/zone.h).
 358  */
 359 static char *zone_ref_subsys_names[] = {
 360         "NFS",          /* ZONE_REF_NFS */
 361         "NFSv4",        /* ZONE_REF_NFSV4 */
 362         "SMBFS",        /* ZONE_REF_SMBFS */
 363         "MNTFS",        /* ZONE_REF_MNTFS */
 364         "LOFI",         /* ZONE_REF_LOFI */
 365         "VFS",          /* ZONE_REF_VFS */
 366         "IPC"           /* ZONE_REF_IPC */
 367 };
 368 
 369 /*
 370  * This isn't static so lint doesn't complain.
 371  */
 372 rctl_hndl_t rc_zone_cpu_shares;
 373 rctl_hndl_t rc_zone_locked_mem;
 374 rctl_hndl_t rc_zone_max_swap;
 375 rctl_hndl_t rc_zone_max_lofi;
 376 rctl_hndl_t rc_zone_cpu_cap;
 377 rctl_hndl_t rc_zone_nlwps;
 378 rctl_hndl_t rc_zone_nprocs;
 379 rctl_hndl_t rc_zone_shmmax;
 380 rctl_hndl_t rc_zone_shmmni;
 381 rctl_hndl_t rc_zone_semmni;
 382 rctl_hndl_t rc_zone_msgmni;
 383 
 384 const char * const zone_default_initname = "/sbin/init";
 385 static char * const zone_prefix = "/zone/";
 386 static int zone_shutdown(zoneid_t zoneid);
 387 static int zone_add_datalink(zoneid_t, datalink_id_t);
 388 static int zone_remove_datalink(zoneid_t, datalink_id_t);
 389 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 390 static int zone_set_network(zoneid_t, zone_net_data_t *);
 391 static int zone_get_network(zoneid_t, zone_net_data_t *);
 392 
 393 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 394 
 395 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 396 static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 397 static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 398 static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 399     zone_key_t);
 400 static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 401 static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 402     kmutex_t *);
 403 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 404     kmutex_t *);
 405 
 406 /*
 407  * Bump this number when you alter the zone syscall interfaces; this is
 408  * because we need to have support for previous API versions in libc
 409  * to support patching; libc calls into the kernel to determine this number.
 410  *
 411  * Version 1 of the API is the version originally shipped with Solaris 10
 412  * Version 2 alters the zone_create system call in order to support more
 413  *     arguments by moving the args into a structure; and to do better
 414  *     error reporting when zone_create() fails.
 415  * Version 3 alters the zone_create system call in order to support the
 416  *     import of ZFS datasets to zones.
 417  * Version 4 alters the zone_create system call in order to support
 418  *     Trusted Extensions.
 419  * Version 5 alters the zone_boot system call, and converts its old
 420  *     bootargs parameter to be set by the zone_setattr API instead.
 421  * Version 6 adds the flag argument to zone_create.
 422  */
 423 static const int ZONE_SYSCALL_API_VERSION = 6;
 424 
 425 /*
 426  * Certain filesystems (such as NFS and autofs) need to know which zone
 427  * the mount is being placed in.  Because of this, we need to be able to
 428  * ensure that a zone isn't in the process of being created/destroyed such
 429  * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 430  * it gets added to the list of mounted zones, it ends up on the wrong zone's
 431  * mount list. Since a zone can't reside on an NFS file system, we don't
 432  * have to worry about the zonepath itself.
 433  *
 434  * The following functions: block_mounts()/resume_mounts() and
 435  * mount_in_progress()/mount_completed() are used by zones and the VFS
 436  * layer (respectively) to synchronize zone state transitions and new
 437  * mounts within a zone. This synchronization is on a per-zone basis, so
 438  * activity for one zone will not interfere with activity for another zone.
 439  *
 440  * The semantics are like a reader-reader lock such that there may
 441  * either be multiple mounts (or zone state transitions, if that weren't
 442  * serialized by zonehash_lock) in progress at the same time, but not
 443  * both.
 444  *
 445  * We use cv's so the user can ctrl-C out of the operation if it's
 446  * taking too long.
 447  *
 448  * The semantics are such that there is unfair bias towards the
 449  * "current" operation.  This means that zone halt may starve if
 450  * there is a rapid succession of new mounts coming in to the zone.
 451  */
 452 /*
 453  * Prevent new mounts from progressing to the point of calling
 454  * VFS_MOUNT().  If there are already mounts in this "region", wait for
 455  * them to complete.
 456  */
 457 static int
 458 block_mounts(zone_t *zp)
 459 {
 460         int retval = 0;
 461 
 462         /*
 463          * Since it may block for a long time, block_mounts() shouldn't be
 464          * called with zonehash_lock held.
 465          */
 466         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 467         mutex_enter(&zp->zone_mount_lock);
 468         while (zp->zone_mounts_in_progress > 0) {
 469                 if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 470                         goto signaled;
 471         }
 472         /*
 473          * A negative value of mounts_in_progress indicates that mounts
 474          * have been blocked by (-mounts_in_progress) different callers
 475          * (remotely possible if two threads enter zone_shutdown at the same
 476          * time).
 477          */
 478         zp->zone_mounts_in_progress--;
 479         retval = 1;
 480 signaled:
 481         mutex_exit(&zp->zone_mount_lock);
 482         return (retval);
 483 }
 484 
 485 /*
 486  * The VFS layer may progress with new mounts as far as we're concerned.
 487  * Allow them to progress if we were the last obstacle.
 488  */
 489 static void
 490 resume_mounts(zone_t *zp)
 491 {
 492         mutex_enter(&zp->zone_mount_lock);
 493         if (++zp->zone_mounts_in_progress == 0)
 494                 cv_broadcast(&zp->zone_mount_cv);
 495         mutex_exit(&zp->zone_mount_lock);
 496 }
 497 
 498 /*
 499  * The VFS layer is busy with a mount; this zone should wait until all
 500  * of its mounts are completed to progress.
 501  */
 502 void
 503 mount_in_progress(zone_t *zp)
 504 {
 505         mutex_enter(&zp->zone_mount_lock);
 506         while (zp->zone_mounts_in_progress < 0)
 507                 cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 508         zp->zone_mounts_in_progress++;
 509         mutex_exit(&zp->zone_mount_lock);
 510 }
 511 
 512 /*
 513  * VFS is done with one mount; wake up any waiting block_mounts()
 514  * callers if this is the last mount.
 515  */
 516 void
 517 mount_completed(zone_t *zp)
 518 {
 519         mutex_enter(&zp->zone_mount_lock);
 520         if (--zp->zone_mounts_in_progress == 0)
 521                 cv_broadcast(&zp->zone_mount_cv);
 522         mutex_exit(&zp->zone_mount_lock);
 523 }
 524 
 525 /*
 526  * ZSD routines.
 527  *
 528  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 529  * defined by the pthread_key_create() and related interfaces.
 530  *
 531  * Kernel subsystems may register one or more data items and/or
 532  * callbacks to be executed when a zone is created, shutdown, or
 533  * destroyed.
 534  *
 535  * Unlike the thread counterpart, destructor callbacks will be executed
 536  * even if the data pointer is NULL and/or there are no constructor
 537  * callbacks, so it is the responsibility of such callbacks to check for
 538  * NULL data values if necessary.
 539  *
 540  * The locking strategy and overall picture is as follows:
 541  *
 542  * When someone calls zone_key_create(), a template ZSD entry is added to the
 543  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 544  * holding that lock all the existing zones are marked as
 545  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 546  * zone_zsd list (protected by zone_lock). The global list is updated first
 547  * (under zsd_key_lock) to make sure that newly created zones use the
 548  * most recent list of keys. Then under zonehash_lock we walk the zones
 549  * and mark them.  Similar locking is used in zone_key_delete().
 550  *
 551  * The actual create, shutdown, and destroy callbacks are done without
 552  * holding any lock. And zsd_flags are used to ensure that the operations
 553  * completed so that when zone_key_create (and zone_create) is done, as well as
 554  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 555  * are completed.
 556  *
 557  * When new zones are created constructor callbacks for all registered ZSD
 558  * entries will be called. That also uses the above two phases of marking
 559  * what needs to be done, and then running the callbacks without holding
 560  * any locks.
 561  *
 562  * The framework does not provide any locking around zone_getspecific() and
 563  * zone_setspecific() apart from that needed for internal consistency, so
 564  * callers interested in atomic "test-and-set" semantics will need to provide
 565  * their own locking.
 566  */
 567 
 568 /*
 569  * Helper function to find the zsd_entry associated with the key in the
 570  * given list.
 571  */
 572 static struct zsd_entry *
 573 zsd_find(list_t *l, zone_key_t key)
 574 {
 575         struct zsd_entry *zsd;
 576 
 577         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 578                 if (zsd->zsd_key == key) {
 579                         return (zsd);
 580                 }
 581         }
 582         return (NULL);
 583 }
 584 
 585 /*
 586  * Helper function to find the zsd_entry associated with the key in the
 587  * given list. Move it to the front of the list.
 588  */
 589 static struct zsd_entry *
 590 zsd_find_mru(list_t *l, zone_key_t key)
 591 {
 592         struct zsd_entry *zsd;
 593 
 594         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 595                 if (zsd->zsd_key == key) {
 596                         /*
 597                          * Move to head of list to keep list in MRU order.
 598                          */
 599                         if (zsd != list_head(l)) {
 600                                 list_remove(l, zsd);
 601                                 list_insert_head(l, zsd);
 602                         }
 603                         return (zsd);
 604                 }
 605         }
 606         return (NULL);
 607 }
 608 
 609 void
 610 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 611     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 612 {
 613         struct zsd_entry *zsdp;
 614         struct zsd_entry *t;
 615         struct zone *zone;
 616         zone_key_t  key;
 617 
 618         zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 619         zsdp->zsd_data = NULL;
 620         zsdp->zsd_create = create;
 621         zsdp->zsd_shutdown = shutdown;
 622         zsdp->zsd_destroy = destroy;
 623 
 624         /*
 625          * Insert in global list of callbacks. Makes future zone creations
 626          * see it.
 627          */
 628         mutex_enter(&zsd_key_lock);
 629         key = zsdp->zsd_key = ++zsd_keyval;
 630         ASSERT(zsd_keyval != 0);
 631         list_insert_tail(&zsd_registered_keys, zsdp);
 632         mutex_exit(&zsd_key_lock);
 633 
 634         /*
 635          * Insert for all existing zones and mark them as needing
 636          * a create callback.
 637          */
 638         mutex_enter(&zonehash_lock);        /* stop the world */
 639         for (zone = list_head(&zone_active); zone != NULL;
 640             zone = list_next(&zone_active, zone)) {
 641                 zone_status_t status;
 642 
 643                 mutex_enter(&zone->zone_lock);
 644 
 645                 /* Skip zones that are on the way down or not yet up */
 646                 status = zone_status_get(zone);
 647                 if (status >= ZONE_IS_DOWN ||
 648                     status == ZONE_IS_UNINITIALIZED) {
 649                         mutex_exit(&zone->zone_lock);
 650                         continue;
 651                 }
 652 
 653                 t = zsd_find_mru(&zone->zone_zsd, key);
 654                 if (t != NULL) {
 655                         /*
 656                          * A zsd_configure already inserted it after
 657                          * we dropped zsd_key_lock above.
 658                          */
 659                         mutex_exit(&zone->zone_lock);
 660                         continue;
 661                 }
 662                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 663                 t->zsd_key = key;
 664                 t->zsd_create = create;
 665                 t->zsd_shutdown = shutdown;
 666                 t->zsd_destroy = destroy;
 667                 if (create != NULL) {
 668                         t->zsd_flags = ZSD_CREATE_NEEDED;
 669                         DTRACE_PROBE2(zsd__create__needed,
 670                             zone_t *, zone, zone_key_t, key);
 671                 }
 672                 list_insert_tail(&zone->zone_zsd, t);
 673                 mutex_exit(&zone->zone_lock);
 674         }
 675         mutex_exit(&zonehash_lock);
 676 
 677         if (create != NULL) {
 678                 /* Now call the create callback for this key */
 679                 zsd_apply_all_zones(zsd_apply_create, key);
 680         }
 681         /*
 682          * It is safe for consumers to use the key now, make it
 683          * globally visible. Specifically zone_getspecific() will
 684          * always successfully return the zone specific data associated
 685          * with the key.
 686          */
 687         *keyp = key;
 688 
 689 }
 690 
 691 /*
 692  * Function called when a module is being unloaded, or otherwise wishes
 693  * to unregister its ZSD key and callbacks.
 694  *
 695  * Remove from the global list and determine the functions that need to
 696  * be called under a global lock. Then call the functions without
 697  * holding any locks. Finally free up the zone_zsd entries. (The apply
 698  * functions need to access the zone_zsd entries to find zsd_data etc.)
 699  */
 700 int
 701 zone_key_delete(zone_key_t key)
 702 {
 703         struct zsd_entry *zsdp = NULL;
 704         zone_t *zone;
 705 
 706         mutex_enter(&zsd_key_lock);
 707         zsdp = zsd_find_mru(&zsd_registered_keys, key);
 708         if (zsdp == NULL) {
 709                 mutex_exit(&zsd_key_lock);
 710                 return (-1);
 711         }
 712         list_remove(&zsd_registered_keys, zsdp);
 713         mutex_exit(&zsd_key_lock);
 714 
 715         mutex_enter(&zonehash_lock);
 716         for (zone = list_head(&zone_active); zone != NULL;
 717             zone = list_next(&zone_active, zone)) {
 718                 struct zsd_entry *del;
 719 
 720                 mutex_enter(&zone->zone_lock);
 721                 del = zsd_find_mru(&zone->zone_zsd, key);
 722                 if (del == NULL) {
 723                         /*
 724                          * Somebody else got here first e.g the zone going
 725                          * away.
 726                          */
 727                         mutex_exit(&zone->zone_lock);
 728                         continue;
 729                 }
 730                 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 731                 ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 732                 if (del->zsd_shutdown != NULL &&
 733                     (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 734                         del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 735                         DTRACE_PROBE2(zsd__shutdown__needed,
 736                             zone_t *, zone, zone_key_t, key);
 737                 }
 738                 if (del->zsd_destroy != NULL &&
 739                     (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 740                         del->zsd_flags |= ZSD_DESTROY_NEEDED;
 741                         DTRACE_PROBE2(zsd__destroy__needed,
 742                             zone_t *, zone, zone_key_t, key);
 743                 }
 744                 mutex_exit(&zone->zone_lock);
 745         }
 746         mutex_exit(&zonehash_lock);
 747         kmem_free(zsdp, sizeof (*zsdp));
 748 
 749         /* Now call the shutdown and destroy callback for this key */
 750         zsd_apply_all_zones(zsd_apply_shutdown, key);
 751         zsd_apply_all_zones(zsd_apply_destroy, key);
 752 
 753         /* Now we can free up the zsdp structures in each zone */
 754         mutex_enter(&zonehash_lock);
 755         for (zone = list_head(&zone_active); zone != NULL;
 756             zone = list_next(&zone_active, zone)) {
 757                 struct zsd_entry *del;
 758 
 759                 mutex_enter(&zone->zone_lock);
 760                 del = zsd_find(&zone->zone_zsd, key);
 761                 if (del != NULL) {
 762                         list_remove(&zone->zone_zsd, del);
 763                         ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 764                         kmem_free(del, sizeof (*del));
 765                 }
 766                 mutex_exit(&zone->zone_lock);
 767         }
 768         mutex_exit(&zonehash_lock);
 769 
 770         return (0);
 771 }
 772 
 773 /*
 774  * ZSD counterpart of pthread_setspecific().
 775  *
 776  * Since all zsd callbacks, including those with no create function,
 777  * have an entry in zone_zsd, if the key is registered it is part of
 778  * the zone_zsd list.
 779  * Return an error if the key wasn't registerd.
 780  */
 781 int
 782 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 783 {
 784         struct zsd_entry *t;
 785 
 786         mutex_enter(&zone->zone_lock);
 787         t = zsd_find_mru(&zone->zone_zsd, key);
 788         if (t != NULL) {
 789                 /*
 790                  * Replace old value with new
 791                  */
 792                 t->zsd_data = (void *)data;
 793                 mutex_exit(&zone->zone_lock);
 794                 return (0);
 795         }
 796         mutex_exit(&zone->zone_lock);
 797         return (-1);
 798 }
 799 
 800 /*
 801  * ZSD counterpart of pthread_getspecific().
 802  */
 803 void *
 804 zone_getspecific(zone_key_t key, zone_t *zone)
 805 {
 806         struct zsd_entry *t;
 807         void *data;
 808 
 809         mutex_enter(&zone->zone_lock);
 810         t = zsd_find_mru(&zone->zone_zsd, key);
 811         data = (t == NULL ? NULL : t->zsd_data);
 812         mutex_exit(&zone->zone_lock);
 813         return (data);
 814 }
 815 
 816 /*
 817  * Function used to initialize a zone's list of ZSD callbacks and data
 818  * when the zone is being created.  The callbacks are initialized from
 819  * the template list (zsd_registered_keys). The constructor callback is
 820  * executed later (once the zone exists and with locks dropped).
 821  */
 822 static void
 823 zone_zsd_configure(zone_t *zone)
 824 {
 825         struct zsd_entry *zsdp;
 826         struct zsd_entry *t;
 827 
 828         ASSERT(MUTEX_HELD(&zonehash_lock));
 829         ASSERT(list_head(&zone->zone_zsd) == NULL);
 830         mutex_enter(&zone->zone_lock);
 831         mutex_enter(&zsd_key_lock);
 832         for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 833             zsdp = list_next(&zsd_registered_keys, zsdp)) {
 834                 /*
 835                  * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 836                  * should not have added anything to it.
 837                  */
 838                 ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 839 
 840                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 841                 t->zsd_key = zsdp->zsd_key;
 842                 t->zsd_create = zsdp->zsd_create;
 843                 t->zsd_shutdown = zsdp->zsd_shutdown;
 844                 t->zsd_destroy = zsdp->zsd_destroy;
 845                 if (zsdp->zsd_create != NULL) {
 846                         t->zsd_flags = ZSD_CREATE_NEEDED;
 847                         DTRACE_PROBE2(zsd__create__needed,
 848                             zone_t *, zone, zone_key_t, zsdp->zsd_key);
 849                 }
 850                 list_insert_tail(&zone->zone_zsd, t);
 851         }
 852         mutex_exit(&zsd_key_lock);
 853         mutex_exit(&zone->zone_lock);
 854 }
 855 
 856 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 857 
 858 /*
 859  * Helper function to execute shutdown or destructor callbacks.
 860  */
 861 static void
 862 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 863 {
 864         struct zsd_entry *t;
 865 
 866         ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 867         ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 868         ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 869 
 870         /*
 871          * Run the callback solely based on what is registered for the zone
 872          * in zone_zsd. The global list can change independently of this
 873          * as keys are registered and unregistered and we don't register new
 874          * callbacks for a zone that is in the process of going away.
 875          */
 876         mutex_enter(&zone->zone_lock);
 877         for (t = list_head(&zone->zone_zsd); t != NULL;
 878             t = list_next(&zone->zone_zsd, t)) {
 879                 zone_key_t key = t->zsd_key;
 880 
 881                 /* Skip if no callbacks registered */
 882 
 883                 if (ct == ZSD_SHUTDOWN) {
 884                         if (t->zsd_shutdown != NULL &&
 885                             (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 886                                 t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 887                                 DTRACE_PROBE2(zsd__shutdown__needed,
 888                                     zone_t *, zone, zone_key_t, key);
 889                         }
 890                 } else {
 891                         if (t->zsd_destroy != NULL &&
 892                             (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 893                                 t->zsd_flags |= ZSD_DESTROY_NEEDED;
 894                                 DTRACE_PROBE2(zsd__destroy__needed,
 895                                     zone_t *, zone, zone_key_t, key);
 896                         }
 897                 }
 898         }
 899         mutex_exit(&zone->zone_lock);
 900 
 901         /* Now call the shutdown and destroy callback for this key */
 902         zsd_apply_all_keys(zsd_apply_shutdown, zone);
 903         zsd_apply_all_keys(zsd_apply_destroy, zone);
 904 
 905 }
 906 
 907 /*
 908  * Called when the zone is going away; free ZSD-related memory, and
 909  * destroy the zone_zsd list.
 910  */
 911 static void
 912 zone_free_zsd(zone_t *zone)
 913 {
 914         struct zsd_entry *t, *next;
 915 
 916         /*
 917          * Free all the zsd_entry's we had on this zone.
 918          */
 919         mutex_enter(&zone->zone_lock);
 920         for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 921                 next = list_next(&zone->zone_zsd, t);
 922                 list_remove(&zone->zone_zsd, t);
 923                 ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 924                 kmem_free(t, sizeof (*t));
 925         }
 926         list_destroy(&zone->zone_zsd);
 927         mutex_exit(&zone->zone_lock);
 928 
 929 }
 930 
 931 /*
 932  * Apply a function to all zones for particular key value.
 933  *
 934  * The applyfn has to drop zonehash_lock if it does some work, and
 935  * then reacquire it before it returns.
 936  * When the lock is dropped we don't follow list_next even
 937  * if it is possible to do so without any hazards. This is
 938  * because we want the design to allow for the list of zones
 939  * to change in any arbitrary way during the time the
 940  * lock was dropped.
 941  *
 942  * It is safe to restart the loop at list_head since the applyfn
 943  * changes the zsd_flags as it does work, so a subsequent
 944  * pass through will have no effect in applyfn, hence the loop will terminate
 945  * in at worst O(N^2).
 946  */
 947 static void
 948 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 949 {
 950         zone_t *zone;
 951 
 952         mutex_enter(&zonehash_lock);
 953         zone = list_head(&zone_active);
 954         while (zone != NULL) {
 955                 if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 956                         /* Lock dropped - restart at head */
 957                         zone = list_head(&zone_active);
 958                 } else {
 959                         zone = list_next(&zone_active, zone);
 960                 }
 961         }
 962         mutex_exit(&zonehash_lock);
 963 }
 964 
 965 /*
 966  * Apply a function to all keys for a particular zone.
 967  *
 968  * The applyfn has to drop zonehash_lock if it does some work, and
 969  * then reacquire it before it returns.
 970  * When the lock is dropped we don't follow list_next even
 971  * if it is possible to do so without any hazards. This is
 972  * because we want the design to allow for the list of zsd callbacks
 973  * to change in any arbitrary way during the time the
 974  * lock was dropped.
 975  *
 976  * It is safe to restart the loop at list_head since the applyfn
 977  * changes the zsd_flags as it does work, so a subsequent
 978  * pass through will have no effect in applyfn, hence the loop will terminate
 979  * in at worst O(N^2).
 980  */
 981 static void
 982 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 983 {
 984         struct zsd_entry *t;
 985 
 986         mutex_enter(&zone->zone_lock);
 987         t = list_head(&zone->zone_zsd);
 988         while (t != NULL) {
 989                 if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 990                         /* Lock dropped - restart at head */
 991                         t = list_head(&zone->zone_zsd);
 992                 } else {
 993                         t = list_next(&zone->zone_zsd, t);
 994                 }
 995         }
 996         mutex_exit(&zone->zone_lock);
 997 }
 998 
 999 /*
1000  * Call the create function for the zone and key if CREATE_NEEDED
1001  * is set.
1002  * If some other thread gets here first and sets CREATE_INPROGRESS, then
1003  * we wait for that thread to complete so that we can ensure that
1004  * all the callbacks are done when we've looped over all zones/keys.
1005  *
1006  * When we call the create function, we drop the global held by the
1007  * caller, and return true to tell the caller it needs to re-evalute the
1008  * state.
1009  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1010  * remains held on exit.
1011  */
1012 static boolean_t
1013 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1014     zone_t *zone, zone_key_t key)
1015 {
1016         void *result;
1017         struct zsd_entry *t;
1018         boolean_t dropped;
1019 
1020         if (lockp != NULL) {
1021                 ASSERT(MUTEX_HELD(lockp));
1022         }
1023         if (zone_lock_held) {
1024                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1025         } else {
1026                 mutex_enter(&zone->zone_lock);
1027         }
1028 
1029         t = zsd_find(&zone->zone_zsd, key);
1030         if (t == NULL) {
1031                 /*
1032                  * Somebody else got here first e.g the zone going
1033                  * away.
1034                  */
1035                 if (!zone_lock_held)
1036                         mutex_exit(&zone->zone_lock);
1037                 return (B_FALSE);
1038         }
1039         dropped = B_FALSE;
1040         if (zsd_wait_for_inprogress(zone, t, lockp))
1041                 dropped = B_TRUE;
1042 
1043         if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1044                 t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1045                 t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1046                 DTRACE_PROBE2(zsd__create__inprogress,
1047                     zone_t *, zone, zone_key_t, key);
1048                 mutex_exit(&zone->zone_lock);
1049                 if (lockp != NULL)
1050                         mutex_exit(lockp);
1051 
1052                 dropped = B_TRUE;
1053                 ASSERT(t->zsd_create != NULL);
1054                 DTRACE_PROBE2(zsd__create__start,
1055                     zone_t *, zone, zone_key_t, key);
1056 
1057                 result = (*t->zsd_create)(zone->zone_id);
1058 
1059                 DTRACE_PROBE2(zsd__create__end,
1060                     zone_t *, zone, voidn *, result);
1061 
1062                 ASSERT(result != NULL);
1063                 if (lockp != NULL)
1064                         mutex_enter(lockp);
1065                 mutex_enter(&zone->zone_lock);
1066                 t->zsd_data = result;
1067                 t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1068                 t->zsd_flags |= ZSD_CREATE_COMPLETED;
1069                 cv_broadcast(&t->zsd_cv);
1070                 DTRACE_PROBE2(zsd__create__completed,
1071                     zone_t *, zone, zone_key_t, key);
1072         }
1073         if (!zone_lock_held)
1074                 mutex_exit(&zone->zone_lock);
1075         return (dropped);
1076 }
1077 
1078 /*
1079  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1080  * is set.
1081  * If some other thread gets here first and sets *_INPROGRESS, then
1082  * we wait for that thread to complete so that we can ensure that
1083  * all the callbacks are done when we've looped over all zones/keys.
1084  *
1085  * When we call the shutdown function, we drop the global held by the
1086  * caller, and return true to tell the caller it needs to re-evalute the
1087  * state.
1088  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1089  * remains held on exit.
1090  */
1091 static boolean_t
1092 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1093     zone_t *zone, zone_key_t key)
1094 {
1095         struct zsd_entry *t;
1096         void *data;
1097         boolean_t dropped;
1098 
1099         if (lockp != NULL) {
1100                 ASSERT(MUTEX_HELD(lockp));
1101         }
1102         if (zone_lock_held) {
1103                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1104         } else {
1105                 mutex_enter(&zone->zone_lock);
1106         }
1107 
1108         t = zsd_find(&zone->zone_zsd, key);
1109         if (t == NULL) {
1110                 /*
1111                  * Somebody else got here first e.g the zone going
1112                  * away.
1113                  */
1114                 if (!zone_lock_held)
1115                         mutex_exit(&zone->zone_lock);
1116                 return (B_FALSE);
1117         }
1118         dropped = B_FALSE;
1119         if (zsd_wait_for_creator(zone, t, lockp))
1120                 dropped = B_TRUE;
1121 
1122         if (zsd_wait_for_inprogress(zone, t, lockp))
1123                 dropped = B_TRUE;
1124 
1125         if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1126                 t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1127                 t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1128                 DTRACE_PROBE2(zsd__shutdown__inprogress,
1129                     zone_t *, zone, zone_key_t, key);
1130                 mutex_exit(&zone->zone_lock);
1131                 if (lockp != NULL)
1132                         mutex_exit(lockp);
1133                 dropped = B_TRUE;
1134 
1135                 ASSERT(t->zsd_shutdown != NULL);
1136                 data = t->zsd_data;
1137 
1138                 DTRACE_PROBE2(zsd__shutdown__start,
1139                     zone_t *, zone, zone_key_t, key);
1140 
1141                 (t->zsd_shutdown)(zone->zone_id, data);
1142                 DTRACE_PROBE2(zsd__shutdown__end,
1143                     zone_t *, zone, zone_key_t, key);
1144 
1145                 if (lockp != NULL)
1146                         mutex_enter(lockp);
1147                 mutex_enter(&zone->zone_lock);
1148                 t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1149                 t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1150                 cv_broadcast(&t->zsd_cv);
1151                 DTRACE_PROBE2(zsd__shutdown__completed,
1152                     zone_t *, zone, zone_key_t, key);
1153         }
1154         if (!zone_lock_held)
1155                 mutex_exit(&zone->zone_lock);
1156         return (dropped);
1157 }
1158 
1159 /*
1160  * Call the destroy function for the zone and key if DESTROY_NEEDED
1161  * is set.
1162  * If some other thread gets here first and sets *_INPROGRESS, then
1163  * we wait for that thread to complete so that we can ensure that
1164  * all the callbacks are done when we've looped over all zones/keys.
1165  *
1166  * When we call the destroy function, we drop the global held by the
1167  * caller, and return true to tell the caller it needs to re-evalute the
1168  * state.
1169  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1170  * remains held on exit.
1171  */
1172 static boolean_t
1173 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1174     zone_t *zone, zone_key_t key)
1175 {
1176         struct zsd_entry *t;
1177         void *data;
1178         boolean_t dropped;
1179 
1180         if (lockp != NULL) {
1181                 ASSERT(MUTEX_HELD(lockp));
1182         }
1183         if (zone_lock_held) {
1184                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1185         } else {
1186                 mutex_enter(&zone->zone_lock);
1187         }
1188 
1189         t = zsd_find(&zone->zone_zsd, key);
1190         if (t == NULL) {
1191                 /*
1192                  * Somebody else got here first e.g the zone going
1193                  * away.
1194                  */
1195                 if (!zone_lock_held)
1196                         mutex_exit(&zone->zone_lock);
1197                 return (B_FALSE);
1198         }
1199         dropped = B_FALSE;
1200         if (zsd_wait_for_creator(zone, t, lockp))
1201                 dropped = B_TRUE;
1202 
1203         if (zsd_wait_for_inprogress(zone, t, lockp))
1204                 dropped = B_TRUE;
1205 
1206         if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1207                 t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1208                 t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1209                 DTRACE_PROBE2(zsd__destroy__inprogress,
1210                     zone_t *, zone, zone_key_t, key);
1211                 mutex_exit(&zone->zone_lock);
1212                 if (lockp != NULL)
1213                         mutex_exit(lockp);
1214                 dropped = B_TRUE;
1215 
1216                 ASSERT(t->zsd_destroy != NULL);
1217                 data = t->zsd_data;
1218                 DTRACE_PROBE2(zsd__destroy__start,
1219                     zone_t *, zone, zone_key_t, key);
1220 
1221                 (t->zsd_destroy)(zone->zone_id, data);
1222                 DTRACE_PROBE2(zsd__destroy__end,
1223                     zone_t *, zone, zone_key_t, key);
1224 
1225                 if (lockp != NULL)
1226                         mutex_enter(lockp);
1227                 mutex_enter(&zone->zone_lock);
1228                 t->zsd_data = NULL;
1229                 t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1230                 t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1231                 cv_broadcast(&t->zsd_cv);
1232                 DTRACE_PROBE2(zsd__destroy__completed,
1233                     zone_t *, zone, zone_key_t, key);
1234         }
1235         if (!zone_lock_held)
1236                 mutex_exit(&zone->zone_lock);
1237         return (dropped);
1238 }
1239 
1240 /*
1241  * Wait for any CREATE_NEEDED flag to be cleared.
1242  * Returns true if lockp was temporarily dropped while waiting.
1243  */
1244 static boolean_t
1245 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1246 {
1247         boolean_t dropped = B_FALSE;
1248 
1249         while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1250                 DTRACE_PROBE2(zsd__wait__for__creator,
1251                     zone_t *, zone, struct zsd_entry *, t);
1252                 if (lockp != NULL) {
1253                         dropped = B_TRUE;
1254                         mutex_exit(lockp);
1255                 }
1256                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1257                 if (lockp != NULL) {
1258                         /* First drop zone_lock to preserve order */
1259                         mutex_exit(&zone->zone_lock);
1260                         mutex_enter(lockp);
1261                         mutex_enter(&zone->zone_lock);
1262                 }
1263         }
1264         return (dropped);
1265 }
1266 
1267 /*
1268  * Wait for any INPROGRESS flag to be cleared.
1269  * Returns true if lockp was temporarily dropped while waiting.
1270  */
1271 static boolean_t
1272 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1273 {
1274         boolean_t dropped = B_FALSE;
1275 
1276         while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1277                 DTRACE_PROBE2(zsd__wait__for__inprogress,
1278                     zone_t *, zone, struct zsd_entry *, t);
1279                 if (lockp != NULL) {
1280                         dropped = B_TRUE;
1281                         mutex_exit(lockp);
1282                 }
1283                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1284                 if (lockp != NULL) {
1285                         /* First drop zone_lock to preserve order */
1286                         mutex_exit(&zone->zone_lock);
1287                         mutex_enter(lockp);
1288                         mutex_enter(&zone->zone_lock);
1289                 }
1290         }
1291         return (dropped);
1292 }
1293 
1294 /*
1295  * Frees memory associated with the zone dataset list.
1296  */
1297 static void
1298 zone_free_datasets(zone_t *zone)
1299 {
1300         zone_dataset_t *t, *next;
1301 
1302         for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1303                 next = list_next(&zone->zone_datasets, t);
1304                 list_remove(&zone->zone_datasets, t);
1305                 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1306                 kmem_free(t, sizeof (*t));
1307         }
1308         list_destroy(&zone->zone_datasets);
1309 }
1310 
1311 /*
1312  * zone.cpu-shares resource control support.
1313  */
1314 /*ARGSUSED*/
1315 static rctl_qty_t
1316 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1317 {
1318         ASSERT(MUTEX_HELD(&p->p_lock));
1319         return (p->p_zone->zone_shares);
1320 }
1321 
1322 /*ARGSUSED*/
1323 static int
1324 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1325     rctl_qty_t nv)
1326 {
1327         ASSERT(MUTEX_HELD(&p->p_lock));
1328         ASSERT(e->rcep_t == RCENTITY_ZONE);
1329         if (e->rcep_p.zone == NULL)
1330                 return (0);
1331 
1332         e->rcep_p.zone->zone_shares = nv;
1333         return (0);
1334 }
1335 
1336 static rctl_ops_t zone_cpu_shares_ops = {
1337         rcop_no_action,
1338         zone_cpu_shares_usage,
1339         zone_cpu_shares_set,
1340         rcop_no_test
1341 };
1342 
1343 /*
1344  * zone.cpu-cap resource control support.
1345  */
1346 /*ARGSUSED*/
1347 static rctl_qty_t
1348 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1349 {
1350         ASSERT(MUTEX_HELD(&p->p_lock));
1351         return (cpucaps_zone_get(p->p_zone));
1352 }
1353 
1354 /*ARGSUSED*/
1355 static int
1356 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1357     rctl_qty_t nv)
1358 {
1359         zone_t *zone = e->rcep_p.zone;
1360 
1361         ASSERT(MUTEX_HELD(&p->p_lock));
1362         ASSERT(e->rcep_t == RCENTITY_ZONE);
1363 
1364         if (zone == NULL)
1365                 return (0);
1366 
1367         /*
1368          * set cap to the new value.
1369          */
1370         return (cpucaps_zone_set(zone, nv));
1371 }
1372 
1373 static rctl_ops_t zone_cpu_cap_ops = {
1374         rcop_no_action,
1375         zone_cpu_cap_get,
1376         zone_cpu_cap_set,
1377         rcop_no_test
1378 };
1379 
1380 /*ARGSUSED*/
1381 static rctl_qty_t
1382 zone_lwps_usage(rctl_t *r, proc_t *p)
1383 {
1384         rctl_qty_t nlwps;
1385         zone_t *zone = p->p_zone;
1386 
1387         ASSERT(MUTEX_HELD(&p->p_lock));
1388 
1389         mutex_enter(&zone->zone_nlwps_lock);
1390         nlwps = zone->zone_nlwps;
1391         mutex_exit(&zone->zone_nlwps_lock);
1392 
1393         return (nlwps);
1394 }
1395 
1396 /*ARGSUSED*/
1397 static int
1398 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1399     rctl_qty_t incr, uint_t flags)
1400 {
1401         rctl_qty_t nlwps;
1402 
1403         ASSERT(MUTEX_HELD(&p->p_lock));
1404         ASSERT(e->rcep_t == RCENTITY_ZONE);
1405         if (e->rcep_p.zone == NULL)
1406                 return (0);
1407         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1408         nlwps = e->rcep_p.zone->zone_nlwps;
1409 
1410         if (nlwps + incr > rcntl->rcv_value)
1411                 return (1);
1412 
1413         return (0);
1414 }
1415 
1416 /*ARGSUSED*/
1417 static int
1418 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1419 {
1420         ASSERT(MUTEX_HELD(&p->p_lock));
1421         ASSERT(e->rcep_t == RCENTITY_ZONE);
1422         if (e->rcep_p.zone == NULL)
1423                 return (0);
1424         e->rcep_p.zone->zone_nlwps_ctl = nv;
1425         return (0);
1426 }
1427 
1428 static rctl_ops_t zone_lwps_ops = {
1429         rcop_no_action,
1430         zone_lwps_usage,
1431         zone_lwps_set,
1432         zone_lwps_test,
1433 };
1434 
1435 /*ARGSUSED*/
1436 static rctl_qty_t
1437 zone_procs_usage(rctl_t *r, proc_t *p)
1438 {
1439         rctl_qty_t nprocs;
1440         zone_t *zone = p->p_zone;
1441 
1442         ASSERT(MUTEX_HELD(&p->p_lock));
1443 
1444         mutex_enter(&zone->zone_nlwps_lock);
1445         nprocs = zone->zone_nprocs;
1446         mutex_exit(&zone->zone_nlwps_lock);
1447 
1448         return (nprocs);
1449 }
1450 
1451 /*ARGSUSED*/
1452 static int
1453 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1454     rctl_qty_t incr, uint_t flags)
1455 {
1456         rctl_qty_t nprocs;
1457 
1458         ASSERT(MUTEX_HELD(&p->p_lock));
1459         ASSERT(e->rcep_t == RCENTITY_ZONE);
1460         if (e->rcep_p.zone == NULL)
1461                 return (0);
1462         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1463         nprocs = e->rcep_p.zone->zone_nprocs;
1464 
1465         if (nprocs + incr > rcntl->rcv_value)
1466                 return (1);
1467 
1468         return (0);
1469 }
1470 
1471 /*ARGSUSED*/
1472 static int
1473 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1474 {
1475         ASSERT(MUTEX_HELD(&p->p_lock));
1476         ASSERT(e->rcep_t == RCENTITY_ZONE);
1477         if (e->rcep_p.zone == NULL)
1478                 return (0);
1479         e->rcep_p.zone->zone_nprocs_ctl = nv;
1480         return (0);
1481 }
1482 
1483 static rctl_ops_t zone_procs_ops = {
1484         rcop_no_action,
1485         zone_procs_usage,
1486         zone_procs_set,
1487         zone_procs_test,
1488 };
1489 
1490 /*ARGSUSED*/
1491 static rctl_qty_t
1492 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1493 {
1494         ASSERT(MUTEX_HELD(&p->p_lock));
1495         return (p->p_zone->zone_shmmax);
1496 }
1497 
1498 /*ARGSUSED*/
1499 static int
1500 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1501     rctl_qty_t incr, uint_t flags)
1502 {
1503         rctl_qty_t v;
1504         ASSERT(MUTEX_HELD(&p->p_lock));
1505         ASSERT(e->rcep_t == RCENTITY_ZONE);
1506         v = e->rcep_p.zone->zone_shmmax + incr;
1507         if (v > rval->rcv_value)
1508                 return (1);
1509         return (0);
1510 }
1511 
1512 static rctl_ops_t zone_shmmax_ops = {
1513         rcop_no_action,
1514         zone_shmmax_usage,
1515         rcop_no_set,
1516         zone_shmmax_test
1517 };
1518 
1519 /*ARGSUSED*/
1520 static rctl_qty_t
1521 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1522 {
1523         ASSERT(MUTEX_HELD(&p->p_lock));
1524         return (p->p_zone->zone_ipc.ipcq_shmmni);
1525 }
1526 
1527 /*ARGSUSED*/
1528 static int
1529 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530     rctl_qty_t incr, uint_t flags)
1531 {
1532         rctl_qty_t v;
1533         ASSERT(MUTEX_HELD(&p->p_lock));
1534         ASSERT(e->rcep_t == RCENTITY_ZONE);
1535         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1536         if (v > rval->rcv_value)
1537                 return (1);
1538         return (0);
1539 }
1540 
1541 static rctl_ops_t zone_shmmni_ops = {
1542         rcop_no_action,
1543         zone_shmmni_usage,
1544         rcop_no_set,
1545         zone_shmmni_test
1546 };
1547 
1548 /*ARGSUSED*/
1549 static rctl_qty_t
1550 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1551 {
1552         ASSERT(MUTEX_HELD(&p->p_lock));
1553         return (p->p_zone->zone_ipc.ipcq_semmni);
1554 }
1555 
1556 /*ARGSUSED*/
1557 static int
1558 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1559     rctl_qty_t incr, uint_t flags)
1560 {
1561         rctl_qty_t v;
1562         ASSERT(MUTEX_HELD(&p->p_lock));
1563         ASSERT(e->rcep_t == RCENTITY_ZONE);
1564         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1565         if (v > rval->rcv_value)
1566                 return (1);
1567         return (0);
1568 }
1569 
1570 static rctl_ops_t zone_semmni_ops = {
1571         rcop_no_action,
1572         zone_semmni_usage,
1573         rcop_no_set,
1574         zone_semmni_test
1575 };
1576 
1577 /*ARGSUSED*/
1578 static rctl_qty_t
1579 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1580 {
1581         ASSERT(MUTEX_HELD(&p->p_lock));
1582         return (p->p_zone->zone_ipc.ipcq_msgmni);
1583 }
1584 
1585 /*ARGSUSED*/
1586 static int
1587 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1588     rctl_qty_t incr, uint_t flags)
1589 {
1590         rctl_qty_t v;
1591         ASSERT(MUTEX_HELD(&p->p_lock));
1592         ASSERT(e->rcep_t == RCENTITY_ZONE);
1593         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1594         if (v > rval->rcv_value)
1595                 return (1);
1596         return (0);
1597 }
1598 
1599 static rctl_ops_t zone_msgmni_ops = {
1600         rcop_no_action,
1601         zone_msgmni_usage,
1602         rcop_no_set,
1603         zone_msgmni_test
1604 };
1605 
1606 /*ARGSUSED*/
1607 static rctl_qty_t
1608 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1609 {
1610         rctl_qty_t q;
1611         ASSERT(MUTEX_HELD(&p->p_lock));
1612         mutex_enter(&p->p_zone->zone_mem_lock);
1613         q = p->p_zone->zone_locked_mem;
1614         mutex_exit(&p->p_zone->zone_mem_lock);
1615         return (q);
1616 }
1617 
1618 /*ARGSUSED*/
1619 static int
1620 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1621     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1622 {
1623         rctl_qty_t q;
1624         zone_t *z;
1625 
1626         z = e->rcep_p.zone;
1627         ASSERT(MUTEX_HELD(&p->p_lock));
1628         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1629         q = z->zone_locked_mem;
1630         if (q + incr > rcntl->rcv_value)
1631                 return (1);
1632         return (0);
1633 }
1634 
1635 /*ARGSUSED*/
1636 static int
1637 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1638     rctl_qty_t nv)
1639 {
1640         ASSERT(MUTEX_HELD(&p->p_lock));
1641         ASSERT(e->rcep_t == RCENTITY_ZONE);
1642         if (e->rcep_p.zone == NULL)
1643                 return (0);
1644         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1645         return (0);
1646 }
1647 
/*
 * rctl operations vector passed to rctl_register() for
 * "zone.max-locked-memory" in zone_init().
 */
static rctl_ops_t zone_locked_mem_ops = {
        rcop_no_action,
        zone_locked_mem_usage,  /* samples zone_locked_mem */
        zone_locked_mem_set,    /* stores the limit in zone_locked_mem_ctl */
        zone_locked_mem_test    /* checks zone_locked_mem + incr */
};
1654 
1655 /*ARGSUSED*/
1656 static rctl_qty_t
1657 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1658 {
1659         rctl_qty_t q;
1660         zone_t *z = p->p_zone;
1661 
1662         ASSERT(MUTEX_HELD(&p->p_lock));
1663         mutex_enter(&z->zone_mem_lock);
1664         q = z->zone_max_swap;
1665         mutex_exit(&z->zone_mem_lock);
1666         return (q);
1667 }
1668 
1669 /*ARGSUSED*/
1670 static int
1671 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1672     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1673 {
1674         rctl_qty_t q;
1675         zone_t *z;
1676 
1677         z = e->rcep_p.zone;
1678         ASSERT(MUTEX_HELD(&p->p_lock));
1679         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1680         q = z->zone_max_swap;
1681         if (q + incr > rcntl->rcv_value)
1682                 return (1);
1683         return (0);
1684 }
1685 
1686 /*ARGSUSED*/
1687 static int
1688 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1689     rctl_qty_t nv)
1690 {
1691         ASSERT(MUTEX_HELD(&p->p_lock));
1692         ASSERT(e->rcep_t == RCENTITY_ZONE);
1693         if (e->rcep_p.zone == NULL)
1694                 return (0);
1695         e->rcep_p.zone->zone_max_swap_ctl = nv;
1696         return (0);
1697 }
1698 
/*
 * rctl operations vector passed to rctl_register() for "zone.max-swap" in
 * zone_init().
 */
static rctl_ops_t zone_max_swap_ops = {
        rcop_no_action,
        zone_max_swap_usage,    /* samples zone_max_swap */
        zone_max_swap_set,      /* stores the limit in zone_max_swap_ctl */
        zone_max_swap_test      /* checks zone_max_swap + incr */
};
1705 
1706 /*ARGSUSED*/
1707 static rctl_qty_t
1708 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1709 {
1710         rctl_qty_t q;
1711         zone_t *z = p->p_zone;
1712 
1713         ASSERT(MUTEX_HELD(&p->p_lock));
1714         mutex_enter(&z->zone_rctl_lock);
1715         q = z->zone_max_lofi;
1716         mutex_exit(&z->zone_rctl_lock);
1717         return (q);
1718 }
1719 
1720 /*ARGSUSED*/
1721 static int
1722 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1723     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1724 {
1725         rctl_qty_t q;
1726         zone_t *z;
1727 
1728         z = e->rcep_p.zone;
1729         ASSERT(MUTEX_HELD(&p->p_lock));
1730         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1731         q = z->zone_max_lofi;
1732         if (q + incr > rcntl->rcv_value)
1733                 return (1);
1734         return (0);
1735 }
1736 
1737 /*ARGSUSED*/
1738 static int
1739 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1740     rctl_qty_t nv)
1741 {
1742         ASSERT(MUTEX_HELD(&p->p_lock));
1743         ASSERT(e->rcep_t == RCENTITY_ZONE);
1744         if (e->rcep_p.zone == NULL)
1745                 return (0);
1746         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1747         return (0);
1748 }
1749 
/*
 * rctl operations vector passed to rctl_register() for "zone.max-lofi" in
 * zone_init().
 */
static rctl_ops_t zone_max_lofi_ops = {
        rcop_no_action,
        zone_max_lofi_usage,    /* samples zone_max_lofi */
        zone_max_lofi_set,      /* stores the limit in zone_max_lofi_ctl */
        zone_max_lofi_test      /* checks zone_max_lofi + incr */
};
1756 
1757 /*
1758  * Helper function to brand the zone with a unique ID.
1759  */
1760 static void
1761 zone_uniqid(zone_t *zone)
1762 {
1763         static uint64_t uniqid = 0;
1764 
1765         ASSERT(MUTEX_HELD(&zonehash_lock));
1766         zone->zone_uniqid = uniqid++;
1767 }
1768 
1769 /*
1770  * Returns a held pointer to the "kcred" for the specified zone.
1771  */
1772 struct cred *
1773 zone_get_kcred(zoneid_t zoneid)
1774 {
1775         zone_t *zone;
1776         cred_t *cr;
1777 
1778         if ((zone = zone_find_by_id(zoneid)) == NULL)
1779                 return (NULL);
1780         cr = zone->zone_kcred;
1781         crhold(cr);
1782         zone_rele(zone);
1783         return (cr);
1784 }
1785 
1786 static int
1787 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1788 {
1789         zone_t *zone = ksp->ks_private;
1790         zone_kstat_t *zk = ksp->ks_data;
1791 
1792         if (rw == KSTAT_WRITE)
1793                 return (EACCES);
1794 
1795         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1796         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1797         return (0);
1798 }
1799 
1800 static int
1801 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1802 {
1803         zone_t *zone = ksp->ks_private;
1804         zone_kstat_t *zk = ksp->ks_data;
1805 
1806         if (rw == KSTAT_WRITE)
1807                 return (EACCES);
1808 
1809         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1810         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1811         return (0);
1812 }
1813 
1814 static int
1815 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1816 {
1817         zone_t *zone = ksp->ks_private;
1818         zone_kstat_t *zk = ksp->ks_data;
1819 
1820         if (rw == KSTAT_WRITE)
1821                 return (EACCES);
1822 
1823         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1824         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1825         return (0);
1826 }
1827 
1828 static kstat_t *
1829 zone_kstat_create_common(zone_t *zone, char *name,
1830     int (*updatefunc) (kstat_t *, int))
1831 {
1832         kstat_t *ksp;
1833         zone_kstat_t *zk;
1834 
1835         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1836             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1837             KSTAT_FLAG_VIRTUAL);
1838 
1839         if (ksp == NULL)
1840                 return (NULL);
1841 
1842         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1843         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1844         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1845         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1846         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1847         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1848         ksp->ks_update = updatefunc;
1849         ksp->ks_private = zone;
1850         kstat_install(ksp);
1851         return (ksp);
1852 }
1853 
1854 
1855 static int
1856 zone_mcap_kstat_update(kstat_t *ksp, int rw)
1857 {
1858         zone_t *zone = ksp->ks_private;
1859         zone_mcap_kstat_t *zmp = ksp->ks_data;
1860 
1861         if (rw == KSTAT_WRITE)
1862                 return (EACCES);
1863 
1864         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1865         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1866         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1867         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1868         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1869 
1870         return (0);
1871 }
1872 
1873 static kstat_t *
1874 zone_mcap_kstat_create(zone_t *zone)
1875 {
1876         kstat_t *ksp;
1877         zone_mcap_kstat_t *zmp;
1878 
1879         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1880             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1881             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1882             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1883                 return (NULL);
1884 
1885         if (zone->zone_id != GLOBAL_ZONEID)
1886                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1887 
1888         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1889         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1890         ksp->ks_lock = &zone->zone_mcap_lock;
1891         zone->zone_mcap_stats = zmp;
1892 
1893         /* The kstat "name" field is not large enough for a full zonename */
1894         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1895         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1896         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1897         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1898         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1899         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1900         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1901             KSTAT_DATA_UINT64);
1902 
1903         ksp->ks_update = zone_mcap_kstat_update;
1904         ksp->ks_private = zone;
1905 
1906         kstat_install(ksp);
1907         return (ksp);
1908 }
1909 
1910 static int
1911 zone_misc_kstat_update(kstat_t *ksp, int rw)
1912 {
1913         zone_t *zone = ksp->ks_private;
1914         zone_misc_kstat_t *zmp = ksp->ks_data;
1915         hrtime_t hrtime;
1916         uint64_t tmp;
1917 
1918         if (rw == KSTAT_WRITE)
1919                 return (EACCES);
1920 
1921         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME);
1922         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1923         scalehrtime(&hrtime);
1924         zmp->zm_stime.value.ui64 = hrtime;
1925 
1926         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME);
1927         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1928         scalehrtime(&hrtime);
1929         zmp->zm_utime.value.ui64 = hrtime;
1930 
1931         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME);
1932         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1933         scalehrtime(&hrtime);
1934         zmp->zm_wtime.value.ui64 = hrtime;
1935 
1936         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1937         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1938         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1939 
1940         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1941         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1942         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1943         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1944 
1945         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1946 
1947         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1948         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1949 
1950         return (0);
1951 }
1952 
1953 static kstat_t *
1954 zone_misc_kstat_create(zone_t *zone)
1955 {
1956         kstat_t *ksp;
1957         zone_misc_kstat_t *zmp;
1958 
1959         if ((ksp = kstat_create_zone("zones", zone->zone_id,
1960             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1961             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1962             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1963                 return (NULL);
1964 
1965         if (zone->zone_id != GLOBAL_ZONEID)
1966                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1967 
1968         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1969         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1970         ksp->ks_lock = &zone->zone_misc_lock;
1971         zone->zone_misc_stats = zmp;
1972 
1973         /* The kstat "name" field is not large enough for a full zonename */
1974         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1975         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1976         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1977         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1978         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1979         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1980         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1981         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1982             KSTAT_DATA_UINT32);
1983         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1984         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1985             KSTAT_DATA_UINT32);
1986         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1987         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1988         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1989             KSTAT_DATA_UINT32);
1990         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1991         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1992 
1993         ksp->ks_update = zone_misc_kstat_update;
1994         ksp->ks_private = zone;
1995 
1996         kstat_install(ksp);
1997         return (ksp);
1998 }
1999 
2000 static void
2001 zone_kstat_create(zone_t *zone)
2002 {
2003         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2004             "lockedmem", zone_lockedmem_kstat_update);
2005         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2006             "swapresv", zone_swapresv_kstat_update);
2007         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2008             "nprocs", zone_nprocs_kstat_update);
2009 
2010         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2011                 zone->zone_mcap_stats = kmem_zalloc(
2012                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2013         }
2014 
2015         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2016                 zone->zone_misc_stats = kmem_zalloc(
2017                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2018         }
2019 }
2020 
2021 static void
2022 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2023 {
2024         void *data;
2025 
2026         if (*pkstat != NULL) {
2027                 data = (*pkstat)->ks_data;
2028                 kstat_delete(*pkstat);
2029                 kmem_free(data, datasz);
2030                 *pkstat = NULL;
2031         }
2032 }
2033 
2034 static void
2035 zone_kstat_delete(zone_t *zone)
2036 {
2037         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2038             sizeof (zone_kstat_t));
2039         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2040             sizeof (zone_kstat_t));
2041         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2042             sizeof (zone_kstat_t));
2043         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2044             sizeof (zone_mcap_kstat_t));
2045         zone_kstat_delete_common(&zone->zone_misc_ksp,
2046             sizeof (zone_misc_kstat_t));
2047 }
2048 
/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 *
 * Specifically, this routine sets up the zonehash and ZSD key locks, the
 * registered-key / active / deathrow lists, zone0's locks, resource
 * counters and limits, its identity fields, and finally links p0 and zone0
 * to each other.
 */
void
zone_zsd_init(void)
{
        /* Global locks and lists used by the zones framework. */
        mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
            offsetof(struct zsd_entry, zsd_linkage));
        list_create(&zone_active, sizeof (zone_t),
            offsetof(zone_t, zone_linkage));
        list_create(&zone_deathrow, sizeof (zone_t),
            offsetof(zone_t, zone_linkage));

        /* zone0 locks, usage counters (zeroed) and limits (unlimited). */
        mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
        zone0.zone_shares = 1;
        zone0.zone_nlwps = 0;
        zone0.zone_nlwps_ctl = INT_MAX;
        zone0.zone_nprocs = 0;
        zone0.zone_nprocs_ctl = INT_MAX;
        zone0.zone_locked_mem = 0;
        zone0.zone_locked_mem_ctl = UINT64_MAX;
        /* zone_max_swap is only checked here, not assigned. */
        ASSERT(zone0.zone_max_swap == 0);
        zone0.zone_max_swap_ctl = UINT64_MAX;
        zone0.zone_max_lofi = 0;
        zone0.zone_max_lofi_ctl = UINT64_MAX;
        zone0.zone_shmmax = 0;
        zone0.zone_ipc.ipcq_shmmni = 0;
        zone0.zone_ipc.ipcq_semmni = 0;
        zone0.zone_ipc.ipcq_msgmni = 0;

        /* zone0 identity and state. */
        zone0.zone_name = GLOBAL_ZONENAME;
        zone0.zone_nodename = utsname.nodename;
        zone0.zone_domain = srpc_domain;
        zone0.zone_hostid = HW_INVALID_HOSTID;
        zone0.zone_fs_allowed = NULL;
        psecflags_default(&zone0.zone_secflags);
        zone0.zone_ref = 1;
        zone0.zone_id = GLOBAL_ZONEID;
        zone0.zone_status = ZONE_IS_RUNNING;
        zone0.zone_rootpath = "/";
        /* NOTE(review): 2 presumably counts "/" plus its NUL -- confirm. */
        zone0.zone_rootpathlen = 2;
        zone0.zone_psetid = ZONE_PS_INVAL;
        zone0.zone_ncpus = 0;
        zone0.zone_ncpus_online = 0;
        zone0.zone_proc_initpid = 1;
        zone0.zone_initname = initname;
        /* The kstats are created later, by zone_init(). */
        zone0.zone_lockedmem_kstat = NULL;
        zone0.zone_swapresv_kstat = NULL;
        zone0.zone_nprocs_kstat = NULL;

        list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
            offsetof(zone_ref_t, zref_linkage));
        list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
            offsetof(struct zsd_entry, zsd_linkage));
        list_insert_head(&zone_active, &zone0);

        /*
         * The root filesystem is not mounted yet, so zone_rootvp cannot be set
         * to anything meaningful.  It is assigned to be 'rootdir' in
         * vfs_mountroot().
         */
        zone0.zone_rootvp = NULL;
        zone0.zone_vfslist = NULL;
        zone0.zone_bootargs = initargs;
        zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
        /*
         * The global zone has all privileges
         */
        priv_fillset(zone0.zone_privset);
        /*
         * Add p0 to the global zone
         */
        zone0.zone_zsched = &p0;
        p0.p_zone = &zone0;
}
2131 
2132 /*
2133  * Compute a hash value based on the contents of the label and the DOI.  The
2134  * hash algorithm is somewhat arbitrary, but is based on the observation that
2135  * humans will likely pick labels that differ by amounts that work out to be
2136  * multiples of the number of hash chains, and thus stirring in some primes
2137  * should help.
2138  */
2139 static uint_t
2140 hash_bylabel(void *hdata, mod_hash_key_t key)
2141 {
2142         const ts_label_t *lab = (ts_label_t *)key;
2143         const uint32_t *up, *ue;
2144         uint_t hash;
2145         int i;
2146 
2147         _NOTE(ARGUNUSED(hdata));
2148 
2149         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2150         /* we depend on alignment of label, but not representation */
2151         up = (const uint32_t *)&lab->tsl_label;
2152         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2153         i = 1;
2154         while (up < ue) {
2155                 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2156                 hash += *up + (*up << ((i % 16) + 1));
2157                 up++;
2158                 i++;
2159         }
2160         return (hash);
2161 }
2162 
2163 /*
2164  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2165  * equal).  This may need to be changed if less than / greater than is ever
2166  * needed.
2167  */
2168 static int
2169 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2170 {
2171         ts_label_t *lab1 = (ts_label_t *)key1;
2172         ts_label_t *lab2 = (ts_label_t *)key2;
2173 
2174         return (label_equal(lab1, lab2) ? 0 : 1);
2175 }
2176 
/*
 * Called by main() to initialize the zones framework.
 *
 * Must run in the context of p0.  In order, this routine: creates the zone
 * ID space, registers the zone.* resource controls, builds zone0's rctl
 * set, creates zone0's kstats, sets up zone0's label and locks, creates the
 * zone hash tables and inserts zone0 into them, publishes "global_zone",
 * and binds the sysevent channel used for zone status notifications.
 * Panics if the event channel cannot be bound.
 */
void
zone_init(void)
{
        rctl_dict_entry_t *rde;
        rctl_val_t *dval;
        rctl_set_t *set;
        rctl_alloc_gp_t *gp;
        rctl_entity_p_t e;
        int res;

        ASSERT(curproc == &p0);

        /*
         * Create ID space for zone IDs.  ID 0 is reserved for the
         * global zone.
         */
        zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

        /*
         * Initialize generic zone resource controls, if any.
         */
        rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
            RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
            RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
            FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);

        rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
            RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
            RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
            RCTL_GLOBAL_INFINITE,
            MAXCAP, MAXCAP, &zone_cpu_cap_ops);

        rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
            RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
            INT_MAX, INT_MAX, &zone_lwps_ops);

        rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
            RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
            INT_MAX, INT_MAX, &zone_procs_ops);

        /*
         * System V IPC resource controls
         */
        rc_zone_msgmni = rctl_register("zone.max-msg-ids",
            RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
            RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);

        rc_zone_semmni = rctl_register("zone.max-sem-ids",
            RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
            RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);

        rc_zone_shmmni = rctl_register("zone.max-shm-ids",
            RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
            RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);

        rc_zone_shmmax = rctl_register("zone.max-shm-memory",
            RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
            RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);

        /*
         * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
         * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
         */
        dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
        bzero(dval, sizeof (rctl_val_t));
        dval->rcv_value = 1;
        dval->rcv_privilege = RCPRIV_PRIVILEGED;
        dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
        dval->rcv_action_recip_pid = -1;

        /* "zone.cpu-shares" was registered above, so the lookup is unchecked */
        rde = rctl_dict_lookup("zone.cpu-shares");
        (void) rctl_val_list_insert(&rde->rcd_default_value, dval);

        /*
         * Memory, swap and lofi resource controls.
         */
        rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
            RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
            RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
            &zone_locked_mem_ops);

        rc_zone_max_swap = rctl_register("zone.max-swap",
            RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
            RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
            &zone_max_swap_ops);

        rc_zone_max_lofi = rctl_register("zone.max-lofi",
            RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
            RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
            &zone_max_lofi_ops);

        /*
         * Initialize the ``global zone''.
         */
        set = rctl_set_create();
        gp = rctl_set_init_prealloc(RCENTITY_ZONE);
        mutex_enter(&p0.p_lock);
        e.rcep_p.zone = &zone0;
        e.rcep_t = RCENTITY_ZONE;
        zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
            gp);

        /* p0 is the only process so far; seed the counters from it. */
        zone0.zone_nlwps = p0.p_lwpcnt;
        zone0.zone_nprocs = 1;
        zone0.zone_ntasks = 1;
        mutex_exit(&p0.p_lock);
        zone0.zone_restart_init = B_TRUE;
        zone0.zone_brand = &native_brand;
        rctl_prealloc_destroy(gp);
        /*
         * pool_default hasn't been initialized yet, so we let pool_init()
         * take care of making sure the global zone is in the default pool.
         */

        /*
         * Initialize global zone kstats
         */
        zone_kstat_create(&zone0);

        /*
         * Initialize zone label.
         * mlp are initialized when tnzonecfg is loaded.
         */
        zone0.zone_slabel = l_admin_low;
        rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
        label_hold(l_admin_low);

        /*
         * Initialise the lock for the database structure used by mntfs.
         */
        rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);

        zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);

        /*
         * Create the zone hash tables and insert zone0, all under
         * zonehash_lock.  zone0 must receive the first unique ID.
         */
        mutex_enter(&zonehash_lock);
        zone_uniqid(&zone0);
        ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);

        zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
            mod_hash_null_valdtor);
        zonehashbyname = mod_hash_create_strhash("zone_by_name",
            zone_hash_size, mod_hash_null_valdtor);
        /*
         * maintain zonehashbylabel only for labeled systems
         */
        if (is_system_labeled())
                zonehashbylabel = mod_hash_create_extended("zone_by_label",
                    zone_hash_size, mod_hash_null_keydtor,
                    mod_hash_null_valdtor, hash_bylabel, NULL,
                    hash_labelkey_cmp, KM_SLEEP);
        zonecount = 1;

        (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
            (mod_hash_val_t)&zone0);
        (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
            (mod_hash_val_t)&zone0);
        if (is_system_labeled()) {
                zone0.zone_flags |= ZF_HASHED_LABEL;
                (void) mod_hash_insert(zonehashbylabel,
                    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
        }
        mutex_exit(&zonehash_lock);

        /*
         * We avoid setting zone_kcred until now, since kcred is initialized
         * sometime after zone_zsd_init() and before zone_init().
         */
        zone0.zone_kcred = kcred;
        /*
         * The global zone is fully initialized (except for zone_rootvp which
         * will be set when the root filesystem is mounted).
         */
        global_zone = &zone0;

        /*
         * Setup an event channel to send zone status change notifications on
         */
        res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
            EVCH_CREAT);

        /* Failure to bind the event channel is treated as fatal. */
        if (res)
                panic("Sysevent_evc_bind failed during zone setup.\n");

}
2361 
/*
 * Release everything still attached to "zone" and free the zone_t itself.
 *
 * The zone must not be the global zone, must have no remaining tasks, LWPs,
 * processes, credential references or kcred, must have an empty reference
 * list, and must be either DEAD or UNINITIALIZED (all enforced by the
 * assertions below).  A DEAD zone is also removed from the deathrow list.
 */
static void
zone_free(zone_t *zone)
{
        ASSERT(zone != global_zone);
        ASSERT(zone->zone_ntasks == 0);
        ASSERT(zone->zone_nlwps == 0);
        ASSERT(zone->zone_nprocs == 0);
        ASSERT(zone->zone_cred_ref == 0);
        ASSERT(zone->zone_kcred == NULL);
        ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
            zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
        ASSERT(list_is_empty(&zone->zone_ref_list));

        /*
         * Remove any zone caps.
         */
        cpucaps_zone_remove(zone);

        ASSERT(zone->zone_cpucap == NULL);

        /* remove from deathrow list */
        if (zone_status_get(zone) == ZONE_IS_DEAD) {
                ASSERT(zone->zone_ref == 0);
                mutex_enter(&zone_deathrow_lock);
                list_remove(&zone_deathrow, zone);
                mutex_exit(&zone_deathrow_lock);
        }

        /* Tear down the per-zone lists, ZSD values and datasets. */
        list_destroy(&zone->zone_ref_list);
        zone_free_zsd(zone);
        zone_free_datasets(zone);
        list_destroy(&zone->zone_dl_list);

        cpu_uarray_free(zone->zone_ustate);

        /*
         * Release optional members.  Each may be unset (for instance when
         * the zone never got past ZONE_IS_UNINITIALIZED), hence the checks.
         */
        if (zone->zone_rootvp != NULL)
                VN_RELE(zone->zone_rootvp);
        if (zone->zone_rootpath)
                kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
        if (zone->zone_name != NULL)
                kmem_free(zone->zone_name, ZONENAME_MAX);
        if (zone->zone_slabel != NULL)
                label_rele(zone->zone_slabel);
        if (zone->zone_nodename != NULL)
                kmem_free(zone->zone_nodename, _SYS_NMLN);
        if (zone->zone_domain != NULL)
                kmem_free(zone->zone_domain, _SYS_NMLN);
        if (zone->zone_privset != NULL)
                kmem_free(zone->zone_privset, sizeof (priv_set_t));
        if (zone->zone_rctls != NULL)
                rctl_set_free(zone->zone_rctls);
        if (zone->zone_bootargs != NULL)
                strfree(zone->zone_bootargs);
        if (zone->zone_initname != NULL)
                strfree(zone->zone_initname);
        if (zone->zone_fs_allowed != NULL)
                strfree(zone->zone_fs_allowed);
        if (zone->zone_pfexecd != NULL)
                klpd_freelist(&zone->zone_pfexecd);
        /* Return the zone ID to the ID space, then destroy locks and free. */
        id_free(zoneid_space, zone->zone_id);
        mutex_destroy(&zone->zone_lock);
        cv_destroy(&zone->zone_cv);
        rw_destroy(&zone->zone_mlps.mlpl_rwlock);
        rw_destroy(&zone->zone_mntfs_db_lock);
        kmem_free(zone, sizeof (zone_t));
}
2428 
2429 /*
2430  * See block comment at the top of this file for information about zone
2431  * status values.
2432  */
2433 /*
2434  * Convenience function for setting zone status.
2435  */
2436 static void
2437 zone_status_set(zone_t *zone, zone_status_t status)
2438 {
2439 
2440         nvlist_t *nvl = NULL;
2441         ASSERT(MUTEX_HELD(&zone_status_lock));
2442         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2443             status >= zone_status_get(zone));
2444 
2445         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2446             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2447             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2448             zone_status_table[status]) ||
2449             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2450             zone_status_table[zone->zone_status]) ||
2451             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2452             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2453             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2454             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2455 #ifdef DEBUG
2456                 (void) printf(
2457                     "Failed to allocate and send zone state change event.\n");
2458 #endif
2459         }
2460         nvlist_free(nvl);
2461 
2462         zone->zone_status = status;
2463 
2464         cv_broadcast(&zone->zone_cv);
2465 }
2466 
2467 /*
2468  * Public function to retrieve the zone status.  The zone status may
2469  * change after it is retrieved.
2470  */
2471 zone_status_t
2472 zone_status_get(zone_t *zone)
2473 {
2474         return (zone->zone_status);
2475 }
2476 
2477 static int
2478 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2479 {
2480         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2481         int err = 0;
2482 
2483         ASSERT(zone != global_zone);
2484         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2485                 goto done;      /* EFAULT or ENAMETOOLONG */
2486 
2487         if (zone->zone_bootargs != NULL)
2488                 strfree(zone->zone_bootargs);
2489 
2490         zone->zone_bootargs = strdup(buf);
2491 
2492 done:
2493         kmem_free(buf, BOOTARGS_MAX);
2494         return (err);
2495 }
2496 
2497 static int
2498 zone_set_brand(zone_t *zone, const char *brand)
2499 {
2500         struct brand_attr *attrp;
2501         brand_t *bp;
2502 
2503         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2504         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2505                 kmem_free(attrp, sizeof (struct brand_attr));
2506                 return (EFAULT);
2507         }
2508 
2509         bp = brand_register_zone(attrp);
2510         kmem_free(attrp, sizeof (struct brand_attr));
2511         if (bp == NULL)
2512                 return (EINVAL);
2513 
2514         /*
2515          * This is the only place where a zone can change it's brand.
2516          * We already need to hold zone_status_lock to check the zone
2517          * status, so we'll just use that lock to serialize zone
2518          * branding requests as well.
2519          */
2520         mutex_enter(&zone_status_lock);
2521 
2522         /* Re-Branding is not allowed and the zone can't be booted yet */
2523         if ((ZONE_IS_BRANDED(zone)) ||
2524             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2525                 mutex_exit(&zone_status_lock);
2526                 brand_unregister_zone(bp);
2527                 return (EINVAL);
2528         }
2529 
2530         /* set up the brand specific data */
2531         zone->zone_brand = bp;
2532         ZBROP(zone)->b_init_brand_data(zone);
2533 
2534         mutex_exit(&zone_status_lock);
2535         return (0);
2536 }
2537 
2538 static int
2539 zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2540 {
2541         int err = 0;
2542         psecflags_t psf;
2543 
2544         ASSERT(zone != global_zone);
2545 
2546         if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
2547                 return (err);
2548 
2549         if (zone_status_get(zone) > ZONE_IS_READY)
2550                 return (EINVAL);
2551 
2552         if (!psecflags_validate(&psf))
2553                 return (EINVAL);
2554 
2555         (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2556 
2557         /* Set security flags on the zone's zsched */
2558         (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2559             sizeof (zone->zone_zsched->p_secflags));
2560 
2561         return (0);
2562 }
2563 
2564 static int
2565 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2566 {
2567         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2568         int err = 0;
2569 
2570         ASSERT(zone != global_zone);
2571         if ((err = copyinstr(zone_fs_allowed, buf,
2572             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2573                 goto done;
2574 
2575         if (zone->zone_fs_allowed != NULL)
2576                 strfree(zone->zone_fs_allowed);
2577 
2578         zone->zone_fs_allowed = strdup(buf);
2579 
2580 done:
2581         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2582         return (err);
2583 }
2584 
2585 static int
2586 zone_set_initname(zone_t *zone, const char *zone_initname)
2587 {
2588         char initname[INITNAME_SZ];
2589         size_t len;
2590         int err = 0;
2591 
2592         ASSERT(zone != global_zone);
2593         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2594                 return (err);   /* EFAULT or ENAMETOOLONG */
2595 
2596         if (zone->zone_initname != NULL)
2597                 strfree(zone->zone_initname);
2598 
2599         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2600         (void) strcpy(zone->zone_initname, initname);
2601         return (0);
2602 }
2603 
2604 static int
2605 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2606 {
2607         uint64_t mcap;
2608         int err = 0;
2609 
2610         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2611                 zone->zone_phys_mcap = mcap;
2612 
2613         return (err);
2614 }
2615 
2616 static int
2617 zone_set_sched_class(zone_t *zone, const char *new_class)
2618 {
2619         char sched_class[PC_CLNMSZ];
2620         id_t classid;
2621         int err;
2622 
2623         ASSERT(zone != global_zone);
2624         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2625                 return (err);   /* EFAULT or ENAMETOOLONG */
2626 
2627         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2628                 return (set_errno(EINVAL));
2629         zone->zone_defaultcid = classid;
2630         ASSERT(zone->zone_defaultcid > 0 &&
2631             zone->zone_defaultcid < loaded_classes);
2632 
2633         return (0);
2634 }
2635 
2636 /*
2637  * Block indefinitely waiting for (zone_status >= status)
2638  */
2639 void
2640 zone_status_wait(zone_t *zone, zone_status_t status)
2641 {
2642         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2643 
2644         mutex_enter(&zone_status_lock);
2645         while (zone->zone_status < status) {
2646                 cv_wait(&zone->zone_cv, &zone_status_lock);
2647         }
2648         mutex_exit(&zone_status_lock);
2649 }
2650 
2651 /*
2652  * Private CPR-safe version of zone_status_wait().
2653  */
2654 static void
2655 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2656 {
2657         callb_cpr_t cprinfo;
2658 
2659         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2660 
2661         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2662             str);
2663         mutex_enter(&zone_status_lock);
2664         while (zone->zone_status < status) {
2665                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2666                 cv_wait(&zone->zone_cv, &zone_status_lock);
2667                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2668         }
2669         /*
2670          * zone_status_lock is implicitly released by the following.
2671          */
2672         CALLB_CPR_EXIT(&cprinfo);
2673 }
2674 
2675 /*
2676  * Block until zone enters requested state or signal is received.  Return (0)
2677  * if signaled, non-zero otherwise.
2678  */
2679 int
2680 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2681 {
2682         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2683 
2684         mutex_enter(&zone_status_lock);
2685         while (zone->zone_status < status) {
2686                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2687                         mutex_exit(&zone_status_lock);
2688                         return (0);
2689                 }
2690         }
2691         mutex_exit(&zone_status_lock);
2692         return (1);
2693 }
2694 
2695 /*
2696  * Block until the zone enters the requested state or the timeout expires,
2697  * whichever happens first.  Return (-1) if operation timed out, time remaining
2698  * otherwise.
2699  */
2700 clock_t
2701 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2702 {
2703         clock_t timeleft = 0;
2704 
2705         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2706 
2707         mutex_enter(&zone_status_lock);
2708         while (zone->zone_status < status && timeleft != -1) {
2709                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2710         }
2711         mutex_exit(&zone_status_lock);
2712         return (timeleft);
2713 }
2714 
2715 /*
2716  * Block until the zone enters the requested state, the current process is
2717  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2718  * operation timed out, 0 if signaled, time remaining otherwise.
2719  */
2720 clock_t
2721 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2722 {
2723         clock_t timeleft = tim - ddi_get_lbolt();
2724 
2725         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2726 
2727         mutex_enter(&zone_status_lock);
2728         while (zone->zone_status < status) {
2729                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2730                     tim);
2731                 if (timeleft <= 0)
2732                         break;
2733         }
2734         mutex_exit(&zone_status_lock);
2735         return (timeleft);
2736 }
2737 
/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds).  We wait for zone_ref to drop to
 * 0 (actually 1; see ZONE_IS_UNREF()), but not zone_cred_ref.  The zone
 * structure itself is later freed when the zone_cred_ref drops to 0,
 * though nothing other than the zone id and privilege set should be
 * accessed once the zone is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0.  This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.
 *
 * Zones also provide a tracked reference counting mechanism in which zone
 * references are represented by "crumbs" (zone_ref structures).  Crumbs help
 * debuggers determine the sources of leaked zone references.  See
 * zone_hold_ref() and zone_rele_ref() below for more information.
 */

/* Debugging flag described above; zero (the default) disables the wait. */
int zone_wait_for_cred = 0;
2761 
2762 static void
2763 zone_hold_locked(zone_t *z)
2764 {
2765         ASSERT(MUTEX_HELD(&z->zone_lock));
2766         z->zone_ref++;
2767         ASSERT(z->zone_ref != 0);
2768 }
2769 
2770 /*
2771  * Increment the specified zone's reference count.  The zone's zone_t structure
2772  * will not be freed as long as the zone's reference count is nonzero.
2773  * Decrement the zone's reference count via zone_rele().
2774  *
2775  * NOTE: This function should only be used to hold zones for short periods of
2776  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2777  */
2778 void
2779 zone_hold(zone_t *z)
2780 {
2781         mutex_enter(&z->zone_lock);
2782         zone_hold_locked(z);
2783         mutex_exit(&z->zone_lock);
2784 }
2785 
/*
 * A zone is ready to be destroyed once its non-cred ref count has dropped
 * to 1, unless we are waiting for cred references (zone_wait_for_cred) and
 * some cred references are still outstanding.
 */
#define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
            !(zone_wait_for_cred && (zone)->zone_cred_ref != 0))
2793 
2794 /*
2795  * Common zone reference release function invoked by zone_rele() and
2796  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2797  * zone's subsystem-specific reference counters are not affected by the
2798  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2799  * removed from the specified zone's reference list.  ref must be non-NULL iff
2800  * subsys is not ZONE_REF_NUM_SUBSYS.
2801  */
2802 static void
2803 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2804 {
2805         boolean_t wakeup;
2806 
2807         mutex_enter(&z->zone_lock);
2808         ASSERT(z->zone_ref != 0);
2809         z->zone_ref--;
2810         if (subsys != ZONE_REF_NUM_SUBSYS) {
2811                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2812                 z->zone_subsys_ref[subsys]--;
2813                 list_remove(&z->zone_ref_list, ref);
2814         }
2815         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2816                 /* no more refs, free the structure */
2817                 mutex_exit(&z->zone_lock);
2818                 zone_free(z);
2819                 return;
2820         }
2821         /* signal zone_destroy so the zone can finish halting */
2822         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2823         mutex_exit(&z->zone_lock);
2824 
2825         if (wakeup) {
2826                 /*
2827                  * Grabbing zonehash_lock here effectively synchronizes with
2828                  * zone_destroy() to avoid missed signals.
2829                  */
2830                 mutex_enter(&zonehash_lock);
2831                 cv_broadcast(&zone_destroy_cv);
2832                 mutex_exit(&zonehash_lock);
2833         }
2834 }
2835 
2836 /*
2837  * Decrement the specified zone's reference count.  The specified zone will
2838  * cease to exist after this function returns if the reference count drops to
2839  * zero.  This function should be paired with zone_hold().
2840  */
2841 void
2842 zone_rele(zone_t *z)
2843 {
2844         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2845 }
2846 
2847 /*
2848  * Initialize a zone reference structure.  This function must be invoked for
2849  * a reference structure before the structure is passed to zone_hold_ref().
2850  */
2851 void
2852 zone_init_ref(zone_ref_t *ref)
2853 {
2854         ref->zref_zone = NULL;
2855         list_link_init(&ref->zref_linkage);
2856 }
2857 
2858 /*
2859  * Acquire a reference to zone z.  The caller must specify the
2860  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2861  * zone_ref_t structure will represent a reference to the specified zone.  Use
2862  * zone_rele_ref() to release the reference.
2863  *
2864  * The referenced zone_t structure will not be freed as long as the zone_t's
2865  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2866  * references.
2867  *
2868  * NOTE: The zone_ref_t structure must be initialized before it is used.
2869  * See zone_init_ref() above.
2870  */
2871 void
2872 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2873 {
2874         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2875 
2876         /*
2877          * Prevent consumers from reusing a reference structure before
2878          * releasing it.
2879          */
2880         VERIFY(ref->zref_zone == NULL);
2881 
2882         ref->zref_zone = z;
2883         mutex_enter(&z->zone_lock);
2884         zone_hold_locked(z);
2885         z->zone_subsys_ref[subsys]++;
2886         ASSERT(z->zone_subsys_ref[subsys] != 0);
2887         list_insert_head(&z->zone_ref_list, ref);
2888         mutex_exit(&z->zone_lock);
2889 }
2890 
2891 /*
2892  * Release the zone reference represented by the specified zone_ref_t.
2893  * The reference is invalid after it's released; however, the zone_ref_t
2894  * structure can be reused without having to invoke zone_init_ref().
2895  * subsys should be the same value that was passed to zone_hold_ref()
2896  * when the reference was acquired.
2897  */
2898 void
2899 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2900 {
2901         zone_rele_common(ref->zref_zone, ref, subsys);
2902 
2903         /*
2904          * Set the zone_ref_t's zref_zone field to NULL to generate panics
2905          * when consumers dereference the reference.  This helps us catch
2906          * consumers who use released references.  Furthermore, this lets
2907          * consumers reuse the zone_ref_t structure without having to
2908          * invoke zone_init_ref().
2909          */
2910         ref->zref_zone = NULL;
2911 }
2912 
2913 void
2914 zone_cred_hold(zone_t *z)
2915 {
2916         mutex_enter(&z->zone_lock);
2917         z->zone_cred_ref++;
2918         ASSERT(z->zone_cred_ref != 0);
2919         mutex_exit(&z->zone_lock);
2920 }
2921 
2922 void
2923 zone_cred_rele(zone_t *z)
2924 {
2925         boolean_t wakeup;
2926 
2927         mutex_enter(&z->zone_lock);
2928         ASSERT(z->zone_cred_ref != 0);
2929         z->zone_cred_ref--;
2930         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2931                 /* no more refs, free the structure */
2932                 mutex_exit(&z->zone_lock);
2933                 zone_free(z);
2934                 return;
2935         }
2936         /*
2937          * If zone_destroy is waiting for the cred references to drain
2938          * out, and they have, signal it.
2939          */
2940         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2941             zone_status_get(z) >= ZONE_IS_DEAD);
2942         mutex_exit(&z->zone_lock);
2943 
2944         if (wakeup) {
2945                 /*
2946                  * Grabbing zonehash_lock here effectively synchronizes with
2947                  * zone_destroy() to avoid missed signals.
2948                  */
2949                 mutex_enter(&zonehash_lock);
2950                 cv_broadcast(&zone_destroy_cv);
2951                 mutex_exit(&zonehash_lock);
2952         }
2953 }
2954 
2955 void
2956 zone_task_hold(zone_t *z)
2957 {
2958         mutex_enter(&z->zone_lock);
2959         z->zone_ntasks++;
2960         ASSERT(z->zone_ntasks != 0);
2961         mutex_exit(&z->zone_lock);
2962 }
2963 
2964 void
2965 zone_task_rele(zone_t *zone)
2966 {
2967         uint_t refcnt;
2968 
2969         mutex_enter(&zone->zone_lock);
2970         ASSERT(zone->zone_ntasks != 0);
2971         refcnt = --zone->zone_ntasks;
2972         if (refcnt > 1)      {       /* Common case */
2973                 mutex_exit(&zone->zone_lock);
2974                 return;
2975         }
2976         zone_hold_locked(zone); /* so we can use the zone_t later */
2977         mutex_exit(&zone->zone_lock);
2978         if (refcnt == 1) {
2979                 /*
2980                  * See if the zone is shutting down.
2981                  */
2982                 mutex_enter(&zone_status_lock);
2983                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2984                         goto out;
2985                 }
2986 
2987                 /*
2988                  * Make sure the ntasks didn't change since we
2989                  * dropped zone_lock.
2990                  */
2991                 mutex_enter(&zone->zone_lock);
2992                 if (refcnt != zone->zone_ntasks) {
2993                         mutex_exit(&zone->zone_lock);
2994                         goto out;
2995                 }
2996                 mutex_exit(&zone->zone_lock);
2997 
2998                 /*
2999                  * No more user processes in the zone.  The zone is empty.
3000                  */
3001                 zone_status_set(zone, ZONE_IS_EMPTY);
3002                 goto out;
3003         }
3004 
3005         ASSERT(refcnt == 0);
3006         /*
3007          * zsched has exited; the zone is dead.
3008          */
3009         zone->zone_zsched = NULL;            /* paranoia */
3010         mutex_enter(&zone_status_lock);
3011         zone_status_set(zone, ZONE_IS_DEAD);
3012 out:
3013         mutex_exit(&zone_status_lock);
3014         zone_rele(zone);
3015 }
3016 
3017 zoneid_t
3018 getzoneid(void)
3019 {
3020         return (curproc->p_zone->zone_id);
3021 }
3022 
3023 /*
3024  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3025  * check the validity of a zone's state.
3026  */
3027 static zone_t *
3028 zone_find_all_by_id(zoneid_t zoneid)
3029 {
3030         mod_hash_val_t hv;
3031         zone_t *zone = NULL;
3032 
3033         ASSERT(MUTEX_HELD(&zonehash_lock));
3034 
3035         if (mod_hash_find(zonehashbyid,
3036             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3037                 zone = (zone_t *)hv;
3038         return (zone);
3039 }
3040 
3041 static zone_t *
3042 zone_find_all_by_label(const ts_label_t *label)
3043 {
3044         mod_hash_val_t hv;
3045         zone_t *zone = NULL;
3046 
3047         ASSERT(MUTEX_HELD(&zonehash_lock));
3048 
3049         /*
3050          * zonehashbylabel is not maintained for unlabeled systems
3051          */
3052         if (!is_system_labeled())
3053                 return (NULL);
3054         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3055                 zone = (zone_t *)hv;
3056         return (zone);
3057 }
3058 
3059 static zone_t *
3060 zone_find_all_by_name(char *name)
3061 {
3062         mod_hash_val_t hv;
3063         zone_t *zone = NULL;
3064 
3065         ASSERT(MUTEX_HELD(&zonehash_lock));
3066 
3067         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3068                 zone = (zone_t *)hv;
3069         return (zone);
3070 }
3071 
3072 /*
3073  * Public interface for looking up a zone by zoneid.  Only returns the zone if
3074  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3075  * Caller must call zone_rele() once it is done with the zone.
3076  *
3077  * The zone may begin the zone_destroy() sequence immediately after this
3078  * function returns, but may be safely used until zone_rele() is called.
3079  */
3080 zone_t *
3081 zone_find_by_id(zoneid_t zoneid)
3082 {
3083         zone_t *zone;
3084         zone_status_t status;
3085 
3086         mutex_enter(&zonehash_lock);
3087         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3088                 mutex_exit(&zonehash_lock);
3089                 return (NULL);
3090         }
3091         status = zone_status_get(zone);
3092         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3093                 /*
3094                  * For all practical purposes the zone doesn't exist.
3095                  */
3096                 mutex_exit(&zonehash_lock);
3097                 return (NULL);
3098         }
3099         zone_hold(zone);
3100         mutex_exit(&zonehash_lock);
3101         return (zone);
3102 }
3103 
3104 /*
3105  * Similar to zone_find_by_id, but using zone label as the key.
3106  */
3107 zone_t *
3108 zone_find_by_label(const ts_label_t *label)
3109 {
3110         zone_t *zone;
3111         zone_status_t status;
3112 
3113         mutex_enter(&zonehash_lock);
3114         if ((zone = zone_find_all_by_label(label)) == NULL) {
3115                 mutex_exit(&zonehash_lock);
3116                 return (NULL);
3117         }
3118 
3119         status = zone_status_get(zone);
3120         if (status > ZONE_IS_DOWN) {
3121                 /*
3122                  * For all practical purposes the zone doesn't exist.
3123                  */
3124                 mutex_exit(&zonehash_lock);
3125                 return (NULL);
3126         }
3127         zone_hold(zone);
3128         mutex_exit(&zonehash_lock);
3129         return (zone);
3130 }
3131 
3132 /*
3133  * Similar to zone_find_by_id, but using zone name as the key.
3134  */
3135 zone_t *
3136 zone_find_by_name(char *name)
3137 {
3138         zone_t *zone;
3139         zone_status_t status;
3140 
3141         mutex_enter(&zonehash_lock);
3142         if ((zone = zone_find_all_by_name(name)) == NULL) {
3143                 mutex_exit(&zonehash_lock);
3144                 return (NULL);
3145         }
3146         status = zone_status_get(zone);
3147         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3148                 /*
3149                  * For all practical purposes the zone doesn't exist.
3150                  */
3151                 mutex_exit(&zonehash_lock);
3152                 return (NULL);
3153         }
3154         zone_hold(zone);
3155         mutex_exit(&zonehash_lock);
3156         return (zone);
3157 }
3158 
3159 /*
3160  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3161  * if there is a zone "foo" rooted at /foo/root, and the path argument
3162  * is "/foo/root/proc", it will return the held zone_t corresponding to
3163  * zone "foo".
3164  *
3165  * zone_find_by_path() always returns a non-NULL value, since at the
3166  * very least every path will be contained in the global zone.
3167  *
3168  * As with the other zone_find_by_*() functions, the caller is
3169  * responsible for zone_rele()ing the return value of this function.
3170  */
3171 zone_t *
3172 zone_find_by_path(const char *path)
3173 {
3174         zone_t *zone;
3175         zone_t *zret = NULL;
3176         zone_status_t status;
3177 
3178         if (path == NULL) {
3179                 /*
3180                  * Call from rootconf().
3181                  */
3182                 zone_hold(global_zone);
3183                 return (global_zone);
3184         }
3185         ASSERT(*path == '/');
3186         mutex_enter(&zonehash_lock);
3187         for (zone = list_head(&zone_active); zone != NULL;
3188             zone = list_next(&zone_active, zone)) {
3189                 if (ZONE_PATH_VISIBLE(path, zone))
3190                         zret = zone;
3191         }
3192         ASSERT(zret != NULL);
3193         status = zone_status_get(zret);
3194         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3195                 /*
3196                  * Zone practically doesn't exist.
3197                  */
3198                 zret = global_zone;
3199         }
3200         zone_hold(zret);
3201         mutex_exit(&zonehash_lock);
3202         return (zret);
3203 }
3204 
3205 /*
3206  * Public interface for updating per-zone load averages.  Called once per
3207  * second.
3208  *
3209  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3210  */
3211 void
3212 zone_loadavg_update(void)
3213 {
3214         zone_t *zp;
3215         zone_status_t status;
3216         struct loadavg_s *lavg;
3217         hrtime_t zone_total;
3218         uint64_t tmp;
3219         int i;
3220         hrtime_t hr_avg;
3221         int nrun;
3222         static int64_t f[3] = { 135, 27, 9 };
3223         int64_t q, r;
3224 
3225         mutex_enter(&zonehash_lock);
3226         for (zp = list_head(&zone_active); zp != NULL;
3227             zp = list_next(&zone_active, zp)) {
3228                 mutex_enter(&zp->zone_lock);
3229 
3230                 /* Skip zones that are on the way down or not yet up */
3231                 status = zone_status_get(zp);
3232                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3233                         /* For all practical purposes the zone doesn't exist. */
3234                         mutex_exit(&zp->zone_lock);
3235                         continue;
3236                 }
3237 
3238                 /*
3239                  * Update the 10 second moving average data in zone_loadavg.
3240                  */
3241                 lavg = &zp->zone_loadavg;
3242 
3243                 tmp = cpu_uarray_sum_all(zp->zone_ustate);
3244                 zone_total = UINT64_OVERFLOW_TO_INT64(tmp);
3245 
3246                 scalehrtime(&zone_total);
3247 
3248                 /* The zone_total should always be increasing. */
3249                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3250                     zone_total - lavg->lg_total : 0;
3251                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3252                 /* lg_total holds the prev. 1 sec. total */
3253                 lavg->lg_total = zone_total;
3254 
3255                 /*
3256                  * To simplify the calculation, we don't calculate the load avg.
3257                  * until the zone has been up for at least 10 seconds and our
3258                  * moving average is thus full.
3259                  */
3260                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3261                         lavg->lg_len++;
3262                         mutex_exit(&zp->zone_lock);
3263                         continue;
3264                 }
3265 
3266                 /* Now calculate the 1min, 5min, 15 min load avg. */
3267                 hr_avg = 0;
3268                 for (i = 0; i < S_LOADAVG_SZ; i++)
3269                         hr_avg += lavg->lg_loads[i];
3270                 hr_avg = hr_avg / S_LOADAVG_SZ;
3271                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3272 
3273                 /* Compute load avg. See comment in calcloadavg() */
3274                 for (i = 0; i < 3; i++) {
3275                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3276                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3277                         zp->zone_hp_avenrun[i] +=
3278                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3279 
3280                         /* avenrun[] can only hold 31 bits of load avg. */
3281                         if (zp->zone_hp_avenrun[i] <
3282                             ((uint64_t)1<<(31+16-FSHIFT)))
3283                                 zp->zone_avenrun[i] = (int32_t)
3284                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3285                         else
3286                                 zp->zone_avenrun[i] = 0x7fffffff;
3287                 }
3288 
3289                 mutex_exit(&zp->zone_lock);
3290         }
3291         mutex_exit(&zonehash_lock);
3292 }
3293 
3294 /*
3295  * Get the number of cpus visible to this zone.  The system-wide global
3296  * 'ncpus' is returned if pools are disabled, the caller is in the
3297  * global zone, or a NULL zone argument is passed in.
3298  */
3299 int
3300 zone_ncpus_get(zone_t *zone)
3301 {
3302         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3303 
3304         return (myncpus != 0 ? myncpus : ncpus);
3305 }
3306 
3307 /*
3308  * Get the number of online cpus visible to this zone.  The system-wide
3309  * global 'ncpus_online' is returned if pools are disabled, the caller
3310  * is in the global zone, or a NULL zone argument is passed in.
3311  */
3312 int
3313 zone_ncpus_online_get(zone_t *zone)
3314 {
3315         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3316 
3317         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3318 }
3319 
3320 /*
3321  * Return the pool to which the zone is currently bound.
3322  */
3323 pool_t *
3324 zone_pool_get(zone_t *zone)
3325 {
3326         ASSERT(pool_lock_held());
3327 
3328         return (zone->zone_pool);
3329 }
3330 
3331 /*
3332  * Set the zone's pool pointer and update the zone's visibility to match
3333  * the resources in the new pool.
3334  */
3335 void
3336 zone_pool_set(zone_t *zone, pool_t *pool)
3337 {
3338         ASSERT(pool_lock_held());
3339         ASSERT(MUTEX_HELD(&cpu_lock));
3340 
3341         zone->zone_pool = pool;
3342         zone_pset_set(zone, pool->pool_pset->pset_id);
3343 }
3344 
3345 /*
3346  * Return the cached value of the id of the processor set to which the
3347  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3348  * facility is disabled.
3349  */
3350 psetid_t
3351 zone_pset_get(zone_t *zone)
3352 {
3353         ASSERT(MUTEX_HELD(&cpu_lock));
3354 
3355         return (zone->zone_psetid);
3356 }
3357 
3358 /*
3359  * Set the cached value of the id of the processor set to which the zone
3360  * is currently bound.  Also update the zone's visibility to match the
3361  * resources in the new processor set.
3362  */
3363 void
3364 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3365 {
3366         psetid_t oldpsetid;
3367 
3368         ASSERT(MUTEX_HELD(&cpu_lock));
3369         oldpsetid = zone_pset_get(zone);
3370 
3371         if (oldpsetid == newpsetid)
3372                 return;
3373         /*
3374          * Global zone sees all.
3375          */
3376         if (zone != global_zone) {
3377                 zone->zone_psetid = newpsetid;
3378                 if (newpsetid != ZONE_PS_INVAL)
3379                         pool_pset_visibility_add(newpsetid, zone);
3380                 if (oldpsetid != ZONE_PS_INVAL)
3381                         pool_pset_visibility_remove(oldpsetid, zone);
3382         }
3383         /*
3384          * Disabling pools, so we should start using the global values
3385          * for ncpus and ncpus_online.
3386          */
3387         if (newpsetid == ZONE_PS_INVAL) {
3388                 zone->zone_ncpus = 0;
3389                 zone->zone_ncpus_online = 0;
3390         }
3391 }
3392 
3393 /*
3394  * Walk the list of active zones and issue the provided callback for
3395  * each of them.
3396  *
3397  * Caller must not be holding any locks that may be acquired under
3398  * zonehash_lock.  See comment at the beginning of the file for a list of
3399  * common locks and their interactions with zones.
3400  */
3401 int
3402 zone_walk(int (*cb)(zone_t *, void *), void *data)
3403 {
3404         zone_t *zone;
3405         int ret = 0;
3406         zone_status_t status;
3407 
3408         mutex_enter(&zonehash_lock);
3409         for (zone = list_head(&zone_active); zone != NULL;
3410             zone = list_next(&zone_active, zone)) {
3411                 /*
3412                  * Skip zones that shouldn't be externally visible.
3413                  */
3414                 status = zone_status_get(zone);
3415                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3416                         continue;
3417                 /*
3418                  * Bail immediately if any callback invocation returns a
3419                  * non-zero value.
3420                  */
3421                 ret = (*cb)(zone, data);
3422                 if (ret != 0)
3423                         break;
3424         }
3425         mutex_exit(&zonehash_lock);
3426         return (ret);
3427 }
3428 
3429 static int
3430 zone_set_root(zone_t *zone, const char *upath)
3431 {
3432         vnode_t *vp;
3433         int trycount;
3434         int error = 0;
3435         char *path;
3436         struct pathname upn, pn;
3437         size_t pathlen;
3438 
3439         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3440                 return (error);
3441 
3442         pn_alloc(&pn);
3443 
3444         /* prevent infinite loop */
3445         trycount = 10;
3446         for (;;) {
3447                 if (--trycount <= 0) {
3448                         error = ESTALE;
3449                         goto out;
3450                 }
3451 
3452                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3453                         /*
3454                          * VOP_ACCESS() may cover 'vp' with a new
3455                          * filesystem, if 'vp' is an autoFS vnode.
3456                          * Get the new 'vp' if so.
3457                          */
3458                         if ((error =
3459                             VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3460                             (!vn_ismntpt(vp) ||
3461                             (error = traverse(&vp)) == 0)) {
3462                                 pathlen = pn.pn_pathlen + 2;
3463                                 path = kmem_alloc(pathlen, KM_SLEEP);
3464                                 (void) strncpy(path, pn.pn_path,
3465                                     pn.pn_pathlen + 1);
3466                                 path[pathlen - 2] = '/';
3467                                 path[pathlen - 1] = '\0';
3468                                 pn_free(&pn);
3469                                 pn_free(&upn);
3470 
3471                                 /* Success! */
3472                                 break;
3473                         }
3474                         VN_RELE(vp);
3475                 }
3476                 if (error != ESTALE)
3477                         goto out;
3478         }
3479 
3480         ASSERT(error == 0);
3481         zone->zone_rootvp = vp;              /* we hold a reference to vp */
3482         zone->zone_rootpath = path;
3483         zone->zone_rootpathlen = pathlen;
3484         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3485                 zone->zone_flags |= ZF_IS_SCRATCH;
3486         return (0);
3487 
3488 out:
3489         pn_free(&pn);
3490         pn_free(&upn);
3491         return (error);
3492 }
3493 
/*
 * ASCII-only alphanumeric test: non-zero for '0'-'9', 'a'-'z' and 'A'-'Z'.
 * The argument is expanded more than once, so it must be free of side
 * effects.
 */
#define isalnum(c) \
        (((c) >= '0' && (c) <= '9') || \
        ((c) >= 'a' && (c) <= 'z') || \
        ((c) >= 'A' && (c) <= 'Z'))
3497 
3498 static int
3499 zone_set_name(zone_t *zone, const char *uname)
3500 {
3501         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3502         size_t len;
3503         int i, err;
3504 
3505         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3506                 kmem_free(kname, ZONENAME_MAX);
3507                 return (err);   /* EFAULT or ENAMETOOLONG */
3508         }
3509 
3510         /* must be less than ZONENAME_MAX */
3511         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3512                 kmem_free(kname, ZONENAME_MAX);
3513                 return (EINVAL);
3514         }
3515 
3516         /*
3517          * Name must start with an alphanumeric and must contain only
3518          * alphanumerics, '-', '_' and '.'.
3519          */
3520         if (!isalnum(kname[0])) {
3521                 kmem_free(kname, ZONENAME_MAX);
3522                 return (EINVAL);
3523         }
3524         for (i = 1; i < len - 1; i++) {
3525                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3526                     kname[i] != '.') {
3527                         kmem_free(kname, ZONENAME_MAX);
3528                         return (EINVAL);
3529                 }
3530         }
3531 
3532         zone->zone_name = kname;
3533         return (0);
3534 }
3535 
3536 /*
3537  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3538  * is NULL or it points to a zone with no hostid emulation, then the machine's
3539  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3540  * zero if neither the zone nor the host machine (global zone) have hostids.  It
3541  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3542  * hostid and the machine's hostid is invalid.
3543  */
3544 uint32_t
3545 zone_get_hostid(zone_t *zonep)
3546 {
3547         unsigned long machine_hostid;
3548 
3549         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3550                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3551                         return (HW_INVALID_HOSTID);
3552                 return ((uint32_t)machine_hostid);
3553         }
3554         return (zonep->zone_hostid);
3555 }
3556 
3557 /*
3558  * Similar to thread_create(), but makes sure the thread is in the appropriate
3559  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3560  */
3561 /*ARGSUSED*/
3562 kthread_t *
3563 zthread_create(
3564     caddr_t stk,
3565     size_t stksize,
3566     void (*proc)(),
3567     void *arg,
3568     size_t len,
3569     pri_t pri)
3570 {
3571         kthread_t *t;
3572         zone_t *zone = curproc->p_zone;
3573         proc_t *pp = zone->zone_zsched;
3574 
3575         zone_hold(zone);        /* Reference to be dropped when thread exits */
3576 
3577         /*
3578          * No-one should be trying to create threads if the zone is shutting
3579          * down and there aren't any kernel threads around.  See comment
3580          * in zthread_exit().
3581          */
3582         ASSERT(!(zone->zone_kthreads == NULL &&
3583             zone_status_get(zone) >= ZONE_IS_EMPTY));
3584         /*
3585          * Create a thread, but don't let it run until we've finished setting
3586          * things up.
3587          */
3588         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3589         ASSERT(t->t_forw == NULL);
3590         mutex_enter(&zone_status_lock);
3591         if (zone->zone_kthreads == NULL) {
3592                 t->t_forw = t->t_back = t;
3593         } else {
3594                 kthread_t *tx = zone->zone_kthreads;
3595 
3596                 t->t_forw = tx;
3597                 t->t_back = tx->t_back;
3598                 tx->t_back->t_forw = t;
3599                 tx->t_back = t;
3600         }
3601         zone->zone_kthreads = t;
3602         mutex_exit(&zone_status_lock);
3603 
3604         mutex_enter(&pp->p_lock);
3605         t->t_proc_flag |= TP_ZTHREAD;
3606         project_rele(t->t_proj);
3607         t->t_proj = project_hold(pp->p_task->tk_proj);
3608 
3609         /*
3610          * Setup complete, let it run.
3611          */
3612         thread_lock(t);
3613         t->t_schedflag |= TS_ALLSTART;
3614         setrun_locked(t);
3615         thread_unlock(t);
3616 
3617         mutex_exit(&pp->p_lock);
3618 
3619         return (t);
3620 }
3621 
3622 /*
3623  * Similar to thread_exit().  Must be called by threads created via
3624  * zthread_exit().
3625  */
3626 void
3627 zthread_exit(void)
3628 {
3629         kthread_t *t = curthread;
3630         proc_t *pp = curproc;
3631         zone_t *zone = pp->p_zone;
3632 
3633         mutex_enter(&zone_status_lock);
3634 
3635         /*
3636          * Reparent to p0
3637          */
3638         kpreempt_disable();
3639         mutex_enter(&pp->p_lock);
3640         t->t_proc_flag &= ~TP_ZTHREAD;
3641         t->t_procp = &p0;
3642         hat_thread_exit(t);
3643         mutex_exit(&pp->p_lock);
3644         kpreempt_enable();
3645 
3646         if (t->t_back == t) {
3647                 ASSERT(t->t_forw == t);
3648                 /*
3649                  * If the zone is empty, once the thread count
3650                  * goes to zero no further kernel threads can be
3651                  * created.  This is because if the creator is a process
3652                  * in the zone, then it must have exited before the zone
3653                  * state could be set to ZONE_IS_EMPTY.
3654                  * Otherwise, if the creator is a kernel thread in the
3655                  * zone, the thread count is non-zero.
3656                  *
3657                  * This really means that non-zone kernel threads should
3658                  * not create zone kernel threads.
3659                  */
3660                 zone->zone_kthreads = NULL;
3661                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3662                         zone_status_set(zone, ZONE_IS_DOWN);
3663                         /*
3664                          * Remove any CPU caps on this zone.
3665                          */
3666                         cpucaps_zone_remove(zone);
3667                 }
3668         } else {
3669                 t->t_forw->t_back = t->t_back;
3670                 t->t_back->t_forw = t->t_forw;
3671                 if (zone->zone_kthreads == t)
3672                         zone->zone_kthreads = t->t_forw;
3673         }
3674         mutex_exit(&zone_status_lock);
3675         zone_rele(zone);
3676         thread_exit();
3677         /* NOTREACHED */
3678 }
3679 
3680 static void
3681 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3682 {
3683         vnode_t *oldvp;
3684 
3685         /* we're going to hold a reference here to the directory */
3686         VN_HOLD(vp);
3687 
3688         /* update abs cwd/root path see c2/audit.c */
3689         if (AU_AUDITING())
3690                 audit_chdirec(vp, vpp);
3691 
3692         mutex_enter(&pp->p_lock);
3693         oldvp = *vpp;
3694         *vpp = vp;
3695         mutex_exit(&pp->p_lock);
3696         if (oldvp != NULL)
3697                 VN_RELE(oldvp);
3698 }
3699 
3700 /*
3701  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3702  */
3703 static int
3704 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3705 {
3706         nvpair_t *nvp = NULL;
3707         boolean_t priv_set = B_FALSE;
3708         boolean_t limit_set = B_FALSE;
3709         boolean_t action_set = B_FALSE;
3710 
3711         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3712                 const char *name;
3713                 uint64_t ui64;
3714 
3715                 name = nvpair_name(nvp);
3716                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3717                         return (EINVAL);
3718                 (void) nvpair_value_uint64(nvp, &ui64);
3719                 if (strcmp(name, "privilege") == 0) {
3720                         /*
3721                          * Currently only privileged values are allowed, but
3722                          * this may change in the future.
3723                          */
3724                         if (ui64 != RCPRIV_PRIVILEGED)
3725                                 return (EINVAL);
3726                         rv->rcv_privilege = ui64;
3727                         priv_set = B_TRUE;
3728                 } else if (strcmp(name, "limit") == 0) {
3729                         rv->rcv_value = ui64;
3730                         limit_set = B_TRUE;
3731                 } else if (strcmp(name, "action") == 0) {
3732                         if (ui64 != RCTL_LOCAL_NOACTION &&
3733                             ui64 != RCTL_LOCAL_DENY)
3734                                 return (EINVAL);
3735                         rv->rcv_flagaction = ui64;
3736                         action_set = B_TRUE;
3737                 } else {
3738                         return (EINVAL);
3739                 }
3740         }
3741 
3742         if (!(priv_set && limit_set && action_set))
3743                 return (EINVAL);
3744         rv->rcv_action_signal = 0;
3745         rv->rcv_action_recipient = NULL;
3746         rv->rcv_action_recip_pid = -1;
3747         rv->rcv_firing_time = 0;
3748 
3749         return (0);
3750 }
3751 
3752 /*
3753  * Non-global zone version of start_init.
3754  */
3755 void
3756 zone_start_init(void)
3757 {
3758         proc_t *p = ttoproc(curthread);
3759         zone_t *z = p->p_zone;
3760 
3761         ASSERT(!INGLOBALZONE(curproc));
3762 
3763         /*
3764          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3765          * storing just the pid of init is sufficient.
3766          */
3767         z->zone_proc_initpid = p->p_pid;
3768 
3769         /*
3770          * We maintain zone_boot_err so that we can return the cause of the
3771          * failure back to the caller of the zone_boot syscall.
3772          */
3773         p->p_zone->zone_boot_err = start_init_common();
3774 
3775         /*
3776          * We will prevent booting zones from becoming running zones if the
3777          * global zone is shutting down.
3778          */
3779         mutex_enter(&zone_status_lock);
3780         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3781             ZONE_IS_SHUTTING_DOWN) {
3782                 /*
3783                  * Make sure we are still in the booting state-- we could have
3784                  * raced and already be shutting down, or even further along.
3785                  */
3786                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
3787                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3788                 }
3789                 mutex_exit(&zone_status_lock);
3790                 /* It's gone bad, dispose of the process */
3791                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3792                         mutex_enter(&p->p_lock);
3793                         ASSERT(p->p_flag & SEXITLWPS);
3794                         lwp_exit();
3795                 }
3796         } else {
3797                 if (zone_status_get(z) == ZONE_IS_BOOTING)
3798                         zone_status_set(z, ZONE_IS_RUNNING);
3799                 mutex_exit(&zone_status_lock);
3800                 /* cause the process to return to userland. */
3801                 lwp_rtt();
3802         }
3803 }
3804 
3805 struct zsched_arg {
3806         zone_t *zone;
3807         nvlist_t *nvlist;
3808 };
3809 
3810 /*
3811  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3812  * anything to do with scheduling, but rather with the fact that
3813  * per-zone kernel threads are parented to zsched, just like regular
3814  * kernel threads are parented to sched (p0).
3815  *
3816  * zsched is also responsible for launching init for the zone.
3817  */
3818 static void
3819 zsched(void *arg)
3820 {
3821         struct zsched_arg *za = arg;
3822         proc_t *pp = curproc;
3823         proc_t *initp = proc_init;
3824         zone_t *zone = za->zone;
3825         cred_t *cr, *oldcred;
3826         rctl_set_t *set;
3827         rctl_alloc_gp_t *gp;
3828         contract_t *ct = NULL;
3829         task_t *tk, *oldtk;
3830         rctl_entity_p_t e;
3831         kproject_t *pj;
3832 
3833         nvlist_t *nvl = za->nvlist;
3834         nvpair_t *nvp = NULL;
3835 
3836         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3837         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3838         PTOU(pp)->u_argc = 0;
3839         PTOU(pp)->u_argv = 0;
3840         PTOU(pp)->u_envp = 0;
3841         PTOU(pp)->u_commpagep = 0;
3842         closeall(P_FINFO(pp));
3843 
3844         /*
3845          * We are this zone's "zsched" process.  As the zone isn't generally
3846          * visible yet we don't need to grab any locks before initializing its
3847          * zone_proc pointer.
3848          */
3849         zone_hold(zone);  /* this hold is released by zone_destroy() */
3850         zone->zone_zsched = pp;
3851         mutex_enter(&pp->p_lock);
3852         pp->p_zone = zone;
3853         mutex_exit(&pp->p_lock);
3854 
3855         /*
3856          * Disassociate process from its 'parent'; parent ourselves to init
3857          * (pid 1) and change other values as needed.
3858          */
3859         sess_create();
3860 
3861         mutex_enter(&pidlock);
3862         proc_detach(pp);
3863         pp->p_ppid = 1;
3864         pp->p_flag |= SZONETOP;
3865         pp->p_ancpid = 1;
3866         pp->p_parent = initp;
3867         pp->p_psibling = NULL;
3868         if (initp->p_child)
3869                 initp->p_child->p_psibling = pp;
3870         pp->p_sibling = initp->p_child;
3871         initp->p_child = pp;
3872 
3873         /* Decrement what newproc() incremented. */
3874         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3875         /*
3876          * Our credentials are about to become kcred-like, so we don't care
3877          * about the caller's ruid.
3878          */
3879         upcount_inc(crgetruid(kcred), zone->zone_id);
3880         mutex_exit(&pidlock);
3881 
3882         /*
3883          * getting out of global zone, so decrement lwp and process counts
3884          */
3885         pj = pp->p_task->tk_proj;
3886         mutex_enter(&global_zone->zone_nlwps_lock);
3887         pj->kpj_nlwps -= pp->p_lwpcnt;
3888         global_zone->zone_nlwps -= pp->p_lwpcnt;
3889         pj->kpj_nprocs--;
3890         global_zone->zone_nprocs--;
3891         mutex_exit(&global_zone->zone_nlwps_lock);
3892 
3893         /*
3894          * Decrement locked memory counts on old zone and project.
3895          */
3896         mutex_enter(&global_zone->zone_mem_lock);
3897         global_zone->zone_locked_mem -= pp->p_locked_mem;
3898         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3899         mutex_exit(&global_zone->zone_mem_lock);
3900 
3901         /*
3902          * Create and join a new task in project '0' of this zone.
3903          *
3904          * We don't need to call holdlwps() since we know we're the only lwp in
3905          * this process.
3906          *
3907          * task_join() returns with p_lock held.
3908          */
3909         tk = task_create(0, zone);
3910         mutex_enter(&cpu_lock);
3911         oldtk = task_join(tk, 0);
3912 
3913         pj = pp->p_task->tk_proj;
3914 
3915         mutex_enter(&zone->zone_mem_lock);
3916         zone->zone_locked_mem += pp->p_locked_mem;
3917         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3918         mutex_exit(&zone->zone_mem_lock);
3919 
3920         /*
3921          * add lwp and process counts to zsched's zone, and increment
3922          * project's task and process count due to the task created in
3923          * the above task_create.
3924          */
3925         mutex_enter(&zone->zone_nlwps_lock);
3926         pj->kpj_nlwps += pp->p_lwpcnt;
3927         pj->kpj_ntasks += 1;
3928         zone->zone_nlwps += pp->p_lwpcnt;
3929         pj->kpj_nprocs++;
3930         zone->zone_nprocs++;
3931         mutex_exit(&zone->zone_nlwps_lock);
3932 
3933         mutex_exit(&curproc->p_lock);
3934         mutex_exit(&cpu_lock);
3935         task_rele(oldtk);
3936 
3937         /*
3938          * The process was created by a process in the global zone, hence the
3939          * credentials are wrong.  We might as well have kcred-ish credentials.
3940          */
3941         cr = zone->zone_kcred;
3942         crhold(cr);
3943         mutex_enter(&pp->p_crlock);
3944         oldcred = pp->p_cred;
3945         pp->p_cred = cr;
3946         mutex_exit(&pp->p_crlock);
3947         crfree(oldcred);
3948 
3949         /*
3950          * Hold credentials again (for thread)
3951          */
3952         crhold(cr);
3953 
3954         /*
3955          * p_lwpcnt can't change since this is a kernel process.
3956          */
3957         crset(pp, cr);
3958 
3959         /*
3960          * Chroot
3961          */
3962         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3963         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3964 
3965         /*
3966          * Initialize zone's rctl set.
3967          */
3968         set = rctl_set_create();
3969         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3970         mutex_enter(&pp->p_lock);
3971         e.rcep_p.zone = zone;
3972         e.rcep_t = RCENTITY_ZONE;
3973         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3974         mutex_exit(&pp->p_lock);
3975         rctl_prealloc_destroy(gp);
3976 
3977         /*
3978          * Apply the rctls passed in to zone_create().  This is basically a list
3979          * assignment: all of the old values are removed and the new ones
3980          * inserted.  That is, if an empty list is passed in, all values are
3981          * removed.
3982          */
3983         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3984                 rctl_dict_entry_t *rde;
3985                 rctl_hndl_t hndl;
3986                 char *name;
3987                 nvlist_t **nvlarray;
3988                 uint_t i, nelem;
3989                 int error;      /* For ASSERT()s */
3990 
3991                 name = nvpair_name(nvp);
3992                 hndl = rctl_hndl_lookup(name);
3993                 ASSERT(hndl != -1);
3994                 rde = rctl_dict_lookup_hndl(hndl);
3995                 ASSERT(rde != NULL);
3996 
3997                 for (; /* ever */; ) {
3998                         rctl_val_t oval;
3999 
4000                         mutex_enter(&pp->p_lock);
4001                         error = rctl_local_get(hndl, NULL, &oval, pp);
4002                         mutex_exit(&pp->p_lock);
4003                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
4004                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4005                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
4006                                 break;
4007                         mutex_enter(&pp->p_lock);
4008                         error = rctl_local_delete(hndl, &oval, pp);
4009                         mutex_exit(&pp->p_lock);
4010                         ASSERT(error == 0);
4011                 }
4012                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4013                 ASSERT(error == 0);
4014                 for (i = 0; i < nelem; i++) {
4015                         rctl_val_t *nvalp;
4016 
4017                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4018                         error = nvlist2rctlval(nvlarray[i], nvalp);
4019                         ASSERT(error == 0);
4020                         /*
4021                          * rctl_local_insert can fail if the value being
4022                          * inserted is a duplicate; this is OK.
4023                          */
4024                         mutex_enter(&pp->p_lock);
4025                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
4026                                 kmem_cache_free(rctl_val_cache, nvalp);
4027                         mutex_exit(&pp->p_lock);
4028                 }
4029         }
4030 
4031         /*
4032          * Tell the world that we're done setting up.
4033          *
4034          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4035          * and atomically set the zone's processor set visibility.  Once
4036          * we drop pool_lock() this zone will automatically get updated
4037          * to reflect any future changes to the pools configuration.
4038          *
4039          * Note that after we drop the locks below (zonehash_lock in
4040          * particular) other operations such as a zone_getattr call can
4041          * now proceed and observe the zone. That is the reason for doing a
4042          * state transition to the INITIALIZED state.
4043          */
4044         pool_lock();
4045         mutex_enter(&cpu_lock);
4046         mutex_enter(&zonehash_lock);
4047         zone_uniqid(zone);
4048         zone_zsd_configure(zone);
4049         if (pool_state == POOL_ENABLED)
4050                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
4051         mutex_enter(&zone_status_lock);
4052         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4053         zone_status_set(zone, ZONE_IS_INITIALIZED);
4054         mutex_exit(&zone_status_lock);
4055         mutex_exit(&zonehash_lock);
4056         mutex_exit(&cpu_lock);
4057         pool_unlock();
4058 
4059         /* Now call the create callback for this key */
4060         zsd_apply_all_keys(zsd_apply_create, zone);
4061 
4062         /* The callbacks are complete. Mark ZONE_IS_READY */
4063         mutex_enter(&zone_status_lock);
4064         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4065         zone_status_set(zone, ZONE_IS_READY);
4066         mutex_exit(&zone_status_lock);
4067 
4068         /*
4069          * Once we see the zone transition to the ZONE_IS_BOOTING state,
4070          * we launch init, and set the state to running.
4071          */
4072         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4073 
4074         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4075                 id_t cid;
4076 
4077                 /*
4078                  * Ok, this is a little complicated.  We need to grab the
4079                  * zone's pool's scheduling class ID; note that by now, we
4080                  * are already bound to a pool if we need to be (zoneadmd
4081                  * will have done that to us while we're in the READY
4082                  * state).  *But* the scheduling class for the zone's 'init'
4083                  * must be explicitly passed to newproc, which doesn't
4084                  * respect pool bindings.
4085                  *
4086                  * We hold the pool_lock across the call to newproc() to
4087                  * close the obvious race: the pool's scheduling class
4088                  * could change before we manage to create the LWP with
4089                  * classid 'cid'.
4090                  */
4091                 pool_lock();
4092                 if (zone->zone_defaultcid > 0)
4093                         cid = zone->zone_defaultcid;
4094                 else
4095                         cid = pool_get_class(zone->zone_pool);
4096                 if (cid == -1)
4097                         cid = defaultcid;
4098 
4099                 /*
4100                  * If this fails, zone_boot will ultimately fail.  The
4101                  * state of the zone will be set to SHUTTING_DOWN-- userland
4102                  * will have to tear down the zone, and fail, or try again.
4103                  */
4104                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4105                     minclsyspri - 1, &ct, 0)) != 0) {
4106                         mutex_enter(&zone_status_lock);
4107                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4108                         mutex_exit(&zone_status_lock);
4109                 } else {
4110                         zone->zone_boot_time = gethrestime_sec();
4111                 }
4112 
4113                 pool_unlock();
4114         }
4115 
4116         /*
4117          * Wait for zone_destroy() to be called.  This is what we spend
4118          * most of our life doing.
4119          */
4120         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4121 
4122         if (ct)
4123                 /*
4124                  * At this point the process contract should be empty.
4125                  * (Though if it isn't, it's not the end of the world.)
4126                  */
4127                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4128 
4129         /*
4130          * Allow kcred to be freed when all referring processes
4131          * (including this one) go away.  We can't just do this in
4132          * zone_free because we need to wait for the zone_cred_ref to
4133          * drop to 0 before calling zone_free, and the existence of
4134          * zone_kcred will prevent that.  Thus, we call crfree here to
4135          * balance the crdup in zone_create.  The crhold calls earlier
4136          * in zsched will be dropped when the thread and process exit.
4137          */
4138         crfree(zone->zone_kcred);
4139         zone->zone_kcred = NULL;
4140 
4141         exit(CLD_EXITED, 0);
4142 }
4143 
4144 /*
4145  * Helper function to determine if there are any submounts of the
4146  * provided path.  Used to make sure the zone doesn't "inherit" any
4147  * mounts from before it is created.
4148  */
4149 static uint_t
4150 zone_mount_count(const char *rootpath)
4151 {
4152         vfs_t *vfsp;
4153         uint_t count = 0;
4154         size_t rootpathlen = strlen(rootpath);
4155 
4156         /*
4157          * Holding zonehash_lock prevents race conditions with
4158          * vfs_list_add()/vfs_list_remove() since we serialize with
4159          * zone_find_by_path().
4160          */
4161         ASSERT(MUTEX_HELD(&zonehash_lock));
4162         /*
4163          * The rootpath must end with a '/'
4164          */
4165         ASSERT(rootpath[rootpathlen - 1] == '/');
4166 
4167         /*
4168          * This intentionally does not count the rootpath itself if that
4169          * happens to be a mount point.
4170          */
4171         vfs_list_read_lock();
4172         vfsp = rootvfs;
4173         do {
4174                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4175                     rootpathlen) == 0)
4176                         count++;
4177                 vfsp = vfsp->vfs_next;
4178         } while (vfsp != rootvfs);
4179         vfs_list_unlock();
4180         return (count);
4181 }
4182 
4183 /*
4184  * Helper function to make sure that a zone created on 'rootpath'
4185  * wouldn't end up containing other zones' rootpaths.
4186  */
4187 static boolean_t
4188 zone_is_nested(const char *rootpath)
4189 {
4190         zone_t *zone;
4191         size_t rootpathlen = strlen(rootpath);
4192         size_t len;
4193 
4194         ASSERT(MUTEX_HELD(&zonehash_lock));
4195 
4196         /*
4197          * zone_set_root() appended '/' and '\0' at the end of rootpath
4198          */
4199         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4200             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4201                 return (B_TRUE);
4202 
4203         for (zone = list_head(&zone_active); zone != NULL;
4204             zone = list_next(&zone_active, zone)) {
4205                 if (zone == global_zone)
4206                         continue;
4207                 len = strlen(zone->zone_rootpath);
4208                 if (strncmp(rootpath, zone->zone_rootpath,
4209                     MIN(rootpathlen, len)) == 0)
4210                         return (B_TRUE);
4211         }
4212         return (B_FALSE);
4213 }
4214 
4215 static int
4216 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4217     size_t zone_privssz)
4218 {
4219         priv_set_t *privs;
4220 
4221         if (zone_privssz < sizeof (priv_set_t))
4222                 return (ENOMEM);
4223 
4224         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4225 
4226         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4227                 kmem_free(privs, sizeof (priv_set_t));
4228                 return (EFAULT);
4229         }
4230 
4231         zone->zone_privset = privs;
4232         return (0);
4233 }
4234 
4235 /*
4236  * We make creative use of nvlists to pass in rctls from userland.  The list is
4237  * a list of the following structures:
4238  *
4239  * (name = rctl_name, value = nvpair_list_array)
4240  *
4241  * Where each element of the nvpair_list_array is of the form:
4242  *
4243  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4244  *      (name = "limit", value = uint64_t),
4245  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4246  */
4247 static int
4248 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4249 {
4250         nvpair_t *nvp = NULL;
4251         nvlist_t *nvl = NULL;
4252         char *kbuf;
4253         int error;
4254         rctl_val_t rv;
4255 
4256         *nvlp = NULL;
4257 
4258         if (buflen == 0)
4259                 return (0);
4260 
4261         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4262                 return (ENOMEM);
4263         if (copyin(ubuf, kbuf, buflen)) {
4264                 error = EFAULT;
4265                 goto out;
4266         }
4267         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4268                 /*
4269                  * nvl may have been allocated/free'd, but the value set to
4270                  * non-NULL, so we reset it here.
4271                  */
4272                 nvl = NULL;
4273                 error = EINVAL;
4274                 goto out;
4275         }
4276         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4277                 rctl_dict_entry_t *rde;
4278                 rctl_hndl_t hndl;
4279                 nvlist_t **nvlarray;
4280                 uint_t i, nelem;
4281                 char *name;
4282 
4283                 error = EINVAL;
4284                 name = nvpair_name(nvp);
4285                 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4286                     != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4287                         goto out;
4288                 }
4289                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4290                         goto out;
4291                 }
4292                 rde = rctl_dict_lookup_hndl(hndl);
4293                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4294                 ASSERT(error == 0);
4295                 for (i = 0; i < nelem; i++) {
4296                         if (error = nvlist2rctlval(nvlarray[i], &rv))
4297                                 goto out;
4298                 }
4299                 if (rctl_invalid_value(rde, &rv)) {
4300                         error = EINVAL;
4301                         goto out;
4302                 }
4303         }
4304         error = 0;
4305         *nvlp = nvl;
4306 out:
4307         kmem_free(kbuf, buflen);
4308         if (error && nvl != NULL)
4309                 nvlist_free(nvl);
4310         return (error);
4311 }
4312 
4313 int
4314 zone_create_error(int er_error, int er_ext, int *er_out)
4315 {
4316         if (er_out != NULL) {
4317                 if (copyout(&er_ext, er_out, sizeof (int))) {
4318                         return (set_errno(EFAULT));
4319                 }
4320         }
4321         return (set_errno(er_error));
4322 }
4323 
4324 static int
4325 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4326 {
4327         ts_label_t *tsl;
4328         bslabel_t blab;
4329 
4330         /* Get label from user */
4331         if (copyin(lab, &blab, sizeof (blab)) != 0)
4332                 return (EFAULT);
4333         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4334         if (tsl == NULL)
4335                 return (ENOMEM);
4336 
4337         zone->zone_slabel = tsl;
4338         return (0);
4339 }
4340 
4341 /*
4342  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4343  */
4344 static int
4345 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4346 {
4347         char *kbuf;
4348         char *dataset, *next;
4349         zone_dataset_t *zd;
4350         size_t len;
4351 
4352         if (ubuf == NULL || buflen == 0)
4353                 return (0);
4354 
4355         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4356                 return (ENOMEM);
4357 
4358         if (copyin(ubuf, kbuf, buflen) != 0) {
4359                 kmem_free(kbuf, buflen);
4360                 return (EFAULT);
4361         }
4362 
4363         dataset = next = kbuf;
4364         for (;;) {
4365                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4366 
4367                 next = strchr(dataset, ',');
4368 
4369                 if (next == NULL)
4370                         len = strlen(dataset);
4371                 else
4372                         len = next - dataset;
4373 
4374                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4375                 bcopy(dataset, zd->zd_dataset, len);
4376                 zd->zd_dataset[len] = '\0';
4377 
4378                 list_insert_head(&zone->zone_datasets, zd);
4379 
4380                 if (next == NULL)
4381                         break;
4382 
4383                 dataset = next + 1;
4384         }
4385 
4386         kmem_free(kbuf, buflen);
4387         return (0);
4388 }
4389 
4390 /*
4391  * System call to create/initialize a new zone named 'zone_name', rooted
4392  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4393  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4394  * with labeling set by 'match', 'doi', and 'label'.
4395  *
4396  * If extended error is non-null, we may use it to return more detailed
4397  * error information.
4398  */
4399 static zoneid_t
4400 zone_create(const char *zone_name, const char *zone_root,
4401     const priv_set_t *zone_privs, size_t zone_privssz,
4402     caddr_t rctlbuf, size_t rctlbufsz,
4403     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4404     int match, uint32_t doi, const bslabel_t *label,
4405     int flags)
4406 {
4407         struct zsched_arg zarg;
4408         nvlist_t *rctls = NULL;
4409         proc_t *pp = curproc;
4410         zone_t *zone, *ztmp;
4411         zoneid_t zoneid, start = GLOBAL_ZONEID;
4412         int error;
4413         int error2 = 0;
4414         char *str;
4415         cred_t *zkcr;
4416         boolean_t insert_label_hash;
4417 
4418         if (secpolicy_zone_config(CRED()) != 0)
4419                 return (set_errno(EPERM));
4420 
4421         /* can't boot zone from within chroot environment */
4422         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4423                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4424                     extended_error));
4425         /*
4426          * As the first step of zone creation, we want to allocate a zoneid.
4427          * This allocation is complicated by the fact that netstacks use the
4428          * zoneid to determine their stackid, but netstacks themselves are
4429          * freed asynchronously with respect to zone destruction.  This means
4430          * that a netstack reference leak (or in principle, an extraordinarily
4431          * long netstack reference hold) could result in a zoneid being
4432          * allocated that in fact corresponds to a stackid from an active
4433          * (referenced) netstack -- unleashing all sorts of havoc when that
4434          * netstack is actually (re)used.  (In the abstract, we might wish a
4435          * zoneid to not be deallocated until its last referencing netstack
4436          * has been released, but netstacks lack a backpointer into their
4437          * referencing zone -- and changing them to have such a pointer would
4438          * be substantial, to put it euphemistically.)  To avoid this, we
4439          * detect this condition on allocation: if we have allocated a zoneid
4440          * that corresponds to a netstack that's still in use, we warn about
4441          * it (as it is much more likely to be a reference leak than an actual
4442          * netstack reference), free it, and allocate another.  That these
4443          * identifers are allocated out of an ID space assures that we won't
4444          * see the identifier we just allocated.
4445          */
4446         for (;;) {
4447                 zoneid = id_alloc(zoneid_space);
4448 
4449                 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4450                         break;
4451 
4452                 id_free(zoneid_space, zoneid);
4453 
4454                 if (start == GLOBAL_ZONEID) {
4455                         start = zoneid;
4456                 } else if (zoneid == start) {
4457                         /*
4458                          * We have managed to iterate over the entire available
4459                          * zoneid space -- there are no identifiers available,
4460                          * presumably due to some number of leaked netstack
4461                          * references.  While it's in principle possible for us
4462                          * to continue to try, it seems wiser to give up at
4463                          * this point to warn and fail explicitly with a
4464                          * distinctive error.
4465                          */
4466                         cmn_err(CE_WARN, "zone_create() failed: all available "
4467                             "zone IDs have netstacks still in use");
4468                         return (set_errno(ENFILE));
4469                 }
4470 
4471                 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4472                     "netstack still in use", zoneid);
4473         }
4474 
4475         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4476         zone->zone_id = zoneid;
4477         zone->zone_status = ZONE_IS_UNINITIALIZED;
4478         zone->zone_pool = pool_default;
4479         zone->zone_pool_mod = gethrtime();
4480         zone->zone_psetid = ZONE_PS_INVAL;
4481         zone->zone_ncpus = 0;
4482         zone->zone_ncpus_online = 0;
4483         zone->zone_restart_init = B_TRUE;
4484         zone->zone_brand = &native_brand;
4485         zone->zone_initname = NULL;
4486         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4487         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4488         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4489         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4490         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4491             offsetof(zone_ref_t, zref_linkage));
4492         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4493             offsetof(struct zsd_entry, zsd_linkage));
4494         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4495             offsetof(zone_dataset_t, zd_linkage));
4496         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4497             offsetof(zone_dl_t, zdl_linkage));
4498         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4499         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4500 
4501         if (flags & ZCF_NET_EXCL) {
4502                 zone->zone_flags |= ZF_NET_EXCL;
4503         }
4504 
4505         if ((error = zone_set_name(zone, zone_name)) != 0) {
4506                 zone_free(zone);
4507                 return (zone_create_error(error, 0, extended_error));
4508         }
4509 
4510         if ((error = zone_set_root(zone, zone_root)) != 0) {
4511                 zone_free(zone);
4512                 return (zone_create_error(error, 0, extended_error));
4513         }
4514         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4515                 zone_free(zone);
4516                 return (zone_create_error(error, 0, extended_error));
4517         }
4518 
4519         /* initialize node name to be the same as zone name */
4520         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4521         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4522         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4523 
4524         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4525         zone->zone_domain[0] = '\0';
4526         zone->zone_hostid = HW_INVALID_HOSTID;
4527         zone->zone_shares = 1;
4528         zone->zone_shmmax = 0;
4529         zone->zone_ipc.ipcq_shmmni = 0;
4530         zone->zone_ipc.ipcq_semmni = 0;
4531         zone->zone_ipc.ipcq_msgmni = 0;
4532         zone->zone_bootargs = NULL;
4533         zone->zone_fs_allowed = NULL;
4534 
4535         psecflags_default(&zone->zone_secflags);
4536 
4537         zone->zone_initname =
4538             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4539         (void) strcpy(zone->zone_initname, zone_default_initname);
4540         zone->zone_nlwps = 0;
4541         zone->zone_nlwps_ctl = INT_MAX;
4542         zone->zone_nprocs = 0;
4543         zone->zone_nprocs_ctl = INT_MAX;
4544         zone->zone_locked_mem = 0;
4545         zone->zone_locked_mem_ctl = UINT64_MAX;
4546         zone->zone_max_swap = 0;
4547         zone->zone_max_swap_ctl = UINT64_MAX;
4548         zone->zone_max_lofi = 0;
4549         zone->zone_max_lofi_ctl = UINT64_MAX;
4550         zone0.zone_lockedmem_kstat = NULL;
4551         zone0.zone_swapresv_kstat = NULL;
4552 
4553         zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
4554 
4555         /*
4556          * Zsched initializes the rctls.
4557          */
4558         zone->zone_rctls = NULL;
4559 
4560         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4561                 zone_free(zone);
4562                 return (zone_create_error(error, 0, extended_error));
4563         }
4564 
4565         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4566                 zone_free(zone);
4567                 return (set_errno(error));
4568         }
4569 
4570         /*
4571          * Read in the trusted system parameters:
4572          * match flag and sensitivity label.
4573          */
4574         zone->zone_match = match;
4575         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4576                 /* Fail if requested to set doi to anything but system's doi */
4577                 if (doi != 0 && doi != default_doi) {
4578                         zone_free(zone);
4579                         return (set_errno(EINVAL));
4580                 }
4581                 /* Always apply system's doi to the zone */
4582                 error = zone_set_label(zone, label, default_doi);
4583                 if (error != 0) {
4584                         zone_free(zone);
4585                         return (set_errno(error));
4586                 }
4587                 insert_label_hash = B_TRUE;
4588         } else {
4589                 /* all zones get an admin_low label if system is not labeled */
4590                 zone->zone_slabel = l_admin_low;
4591                 label_hold(l_admin_low);
4592                 insert_label_hash = B_FALSE;
4593         }
4594 
4595         /*
4596          * Stop all lwps since that's what normally happens as part of fork().
4597          * This needs to happen before we grab any locks to avoid deadlock
4598          * (another lwp in the process could be waiting for the held lock).
4599          */
4600         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4601                 zone_free(zone);
4602                 nvlist_free(rctls);
4603                 return (zone_create_error(error, 0, extended_error));
4604         }
4605 
4606         if (block_mounts(zone) == 0) {
4607                 mutex_enter(&pp->p_lock);
4608                 if (curthread != pp->p_agenttp)
4609                         continuelwps(pp);
4610                 mutex_exit(&pp->p_lock);
4611                 zone_free(zone);
4612                 nvlist_free(rctls);
4613                 return (zone_create_error(error, 0, extended_error));
4614         }
4615 
4616         /*
4617          * Set up credential for kernel access.  After this, any errors
4618          * should go through the dance in errout rather than calling
4619          * zone_free directly.
4620          */
4621         zone->zone_kcred = crdup(kcred);
4622         crsetzone(zone->zone_kcred, zone);
4623         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4624         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4625         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4626         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4627 
4628         mutex_enter(&zonehash_lock);
4629         /*
4630          * Make sure zone doesn't already exist.
4631          *
4632          * If the system and zone are labeled,
4633          * make sure no other zone exists that has the same label.
4634          */
4635         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4636             (insert_label_hash &&
4637             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4638                 zone_status_t status;
4639 
4640                 status = zone_status_get(ztmp);
4641                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4642                         error = EEXIST;
4643                 else
4644                         error = EBUSY;
4645 
4646                 if (insert_label_hash)
4647                         error2 = ZE_LABELINUSE;
4648 
4649                 goto errout;
4650         }
4651 
4652         /*
4653          * Don't allow zone creations which would cause one zone's rootpath to
4654          * be accessible from that of another (non-global) zone.
4655          */
4656         if (zone_is_nested(zone->zone_rootpath)) {
4657                 error = EBUSY;
4658                 goto errout;
4659         }
4660 
4661         ASSERT(zonecount != 0);         /* check for leaks */
4662         if (zonecount + 1 > maxzones) {
4663                 error = ENOMEM;
4664                 goto errout;
4665         }
4666 
4667         if (zone_mount_count(zone->zone_rootpath) != 0) {
4668                 error = EBUSY;
4669                 error2 = ZE_AREMOUNTS;
4670                 goto errout;
4671         }
4672 
4673         /*
4674          * Zone is still incomplete, but we need to drop all locks while
4675          * zsched() initializes this zone's kernel process.  We
4676          * optimistically add the zone to the hashtable and associated
4677          * lists so a parallel zone_create() doesn't try to create the
4678          * same zone.
4679          */
4680         zonecount++;
4681         (void) mod_hash_insert(zonehashbyid,
4682             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4683             (mod_hash_val_t)(uintptr_t)zone);
4684         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4685         (void) strcpy(str, zone->zone_name);
4686         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4687             (mod_hash_val_t)(uintptr_t)zone);
4688         if (insert_label_hash) {
4689                 (void) mod_hash_insert(zonehashbylabel,
4690                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4691                 zone->zone_flags |= ZF_HASHED_LABEL;
4692         }
4693 
4694         /*
4695          * Insert into active list.  At this point there are no 'hold's
4696          * on the zone, but everyone else knows not to use it, so we can
4697          * continue to use it.  zsched() will do a zone_hold() if the
4698          * newproc() is successful.
4699          */
4700         list_insert_tail(&zone_active, zone);
4701         mutex_exit(&zonehash_lock);
4702 
4703         zarg.zone = zone;
4704         zarg.nvlist = rctls;
4705         /*
4706          * The process, task, and project rctls are probably wrong;
4707          * we need an interface to get the default values of all rctls,
4708          * and initialize zsched appropriately.  I'm not sure that that
4709          * makes much of a difference, though.
4710          */
4711         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4712         if (error != 0) {
4713                 /*
4714                  * We need to undo all globally visible state.
4715                  */
4716                 mutex_enter(&zonehash_lock);
4717                 list_remove(&zone_active, zone);
4718                 if (zone->zone_flags & ZF_HASHED_LABEL) {
4719                         ASSERT(zone->zone_slabel != NULL);
4720                         (void) mod_hash_destroy(zonehashbylabel,
4721                             (mod_hash_key_t)zone->zone_slabel);
4722                 }
4723                 (void) mod_hash_destroy(zonehashbyname,
4724                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4725                 (void) mod_hash_destroy(zonehashbyid,
4726                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4727                 ASSERT(zonecount > 1);
4728                 zonecount--;
4729                 goto errout;
4730         }
4731 
4732         /*
4733          * Zone creation can't fail from now on.
4734          */
4735 
4736         /*
4737          * Create zone kstats
4738          */
4739         zone_kstat_create(zone);
4740 
4741         /*
4742          * Let the other lwps continue.
4743          */
4744         mutex_enter(&pp->p_lock);
4745         if (curthread != pp->p_agenttp)
4746                 continuelwps(pp);
4747         mutex_exit(&pp->p_lock);
4748 
4749         /*
4750          * Wait for zsched to finish initializing the zone.
4751          */
4752         zone_status_wait(zone, ZONE_IS_READY);
4753         /*
4754          * The zone is fully visible, so we can let mounts progress.
4755          */
4756         resume_mounts(zone);
4757         nvlist_free(rctls);
4758 
4759         return (zoneid);
4760 
4761 errout:
4762         mutex_exit(&zonehash_lock);
4763         /*
4764          * Let the other lwps continue.
4765          */
4766         mutex_enter(&pp->p_lock);
4767         if (curthread != pp->p_agenttp)
4768                 continuelwps(pp);
4769         mutex_exit(&pp->p_lock);
4770 
4771         resume_mounts(zone);
4772         nvlist_free(rctls);
4773         /*
4774          * There is currently one reference to the zone, a cred_ref from
4775          * zone_kcred.  To free the zone, we call crfree, which will call
4776          * zone_cred_rele, which will call zone_free.
4777          */
4778         ASSERT(zone->zone_cred_ref == 1);
4779         ASSERT(zone->zone_kcred->cr_ref == 1);
4780         ASSERT(zone->zone_ref == 0);
4781         zkcr = zone->zone_kcred;
4782         zone->zone_kcred = NULL;
4783         crfree(zkcr);                           /* triggers call to zone_free */
4784         return (zone_create_error(error, error2, extended_error));
4785 }
4786 
4787 /*
4788  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4789  * the heavy lifting.  initname is the path to the program to launch
4790  * at the "top" of the zone; if this is NULL, we use the system default,
4791  * which is stored at zone_default_initname.
4792  */
4793 static int
4794 zone_boot(zoneid_t zoneid)
4795 {
4796         int err;
4797         zone_t *zone;
4798 
4799         if (secpolicy_zone_config(CRED()) != 0)
4800                 return (set_errno(EPERM));
4801         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4802                 return (set_errno(EINVAL));
4803 
4804         mutex_enter(&zonehash_lock);
4805         /*
4806          * Look for zone under hash lock to prevent races with calls to
4807          * zone_shutdown, zone_destroy, etc.
4808          */
4809         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4810                 mutex_exit(&zonehash_lock);
4811                 return (set_errno(EINVAL));
4812         }
4813 
4814         mutex_enter(&zone_status_lock);
4815         if (zone_status_get(zone) != ZONE_IS_READY) {
4816                 mutex_exit(&zone_status_lock);
4817                 mutex_exit(&zonehash_lock);
4818                 return (set_errno(EINVAL));
4819         }
4820         zone_status_set(zone, ZONE_IS_BOOTING);
4821         mutex_exit(&zone_status_lock);
4822 
4823         zone_hold(zone);        /* so we can use the zone_t later */
4824         mutex_exit(&zonehash_lock);
4825 
4826         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4827                 zone_rele(zone);
4828                 return (set_errno(EINTR));
4829         }
4830 
4831         /*
4832          * Boot (starting init) might have failed, in which case the zone
4833          * will go to the SHUTTING_DOWN state; an appropriate errno will
4834          * be placed in zone->zone_boot_err, and so we return that.
4835          */
4836         err = zone->zone_boot_err;
4837         zone_rele(zone);
4838         return (err ? set_errno(err) : 0);
4839 }
4840 
4841 /*
4842  * Kills all user processes in the zone, waiting for them all to exit
4843  * before returning.
4844  */
4845 static int
4846 zone_empty(zone_t *zone)
4847 {
4848         int waitstatus;
4849 
4850         /*
4851          * We need to drop zonehash_lock before killing all
4852          * processes, otherwise we'll deadlock with zone_find_*
4853          * which can be called from the exit path.
4854          */
4855         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4856         while ((waitstatus = zone_status_timedwait_sig(zone,
4857             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4858                 killall(zone->zone_id);
4859         }
4860         /*
4861          * return EINTR if we were signaled
4862          */
4863         if (waitstatus == 0)
4864                 return (EINTR);
4865         return (0);
4866 }
4867 
4868 /*
4869  * This function implements the policy for zone visibility.
4870  *
4871  * In standard Solaris, a non-global zone can only see itself.
4872  *
4873  * In Trusted Extensions, a labeled zone can lookup any zone whose label
4874  * it dominates. For this test, the label of the global zone is treated as
4875  * admin_high so it is special-cased instead of being checked for dominance.
4876  *
4877  * Returns true if zone attributes are viewable, false otherwise.
4878  */
4879 static boolean_t
4880 zone_list_access(zone_t *zone)
4881 {
4882 
4883         if (curproc->p_zone == global_zone ||
4884             curproc->p_zone == zone) {
4885                 return (B_TRUE);
4886         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4887                 bslabel_t *curproc_label;
4888                 bslabel_t *zone_label;
4889 
4890                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4891                 zone_label = label2bslabel(zone->zone_slabel);
4892 
4893                 if (zone->zone_id != GLOBAL_ZONEID &&
4894                     bldominates(curproc_label, zone_label)) {
4895                         return (B_TRUE);
4896                 } else {
4897                         return (B_FALSE);
4898                 }
4899         } else {
4900                 return (B_FALSE);
4901         }
4902 }
4903 
4904 /*
4905  * Systemcall to start the zone's halt sequence.  By the time this
4906  * function successfully returns, all user processes and kernel threads
4907  * executing in it will have exited, ZSD shutdown callbacks executed,
4908  * and the zone status set to ZONE_IS_DOWN.
4909  *
4910  * It is possible that the call will interrupt itself if the caller is the
4911  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4912  */
4913 static int
4914 zone_shutdown(zoneid_t zoneid)
4915 {
4916         int error;
4917         zone_t *zone;
4918         zone_status_t status;
4919 
4920         if (secpolicy_zone_config(CRED()) != 0)
4921                 return (set_errno(EPERM));
4922         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4923                 return (set_errno(EINVAL));
4924 
4925         mutex_enter(&zonehash_lock);
4926         /*
4927          * Look for zone under hash lock to prevent races with other
4928          * calls to zone_shutdown and zone_destroy.
4929          */
4930         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4931                 mutex_exit(&zonehash_lock);
4932                 return (set_errno(EINVAL));
4933         }
4934 
4935         /*
4936          * We have to drop zonehash_lock before calling block_mounts.
4937          * Hold the zone so we can continue to use the zone_t.
4938          */
4939         zone_hold(zone);
4940         mutex_exit(&zonehash_lock);
4941 
4942         /*
4943          * Block mounts so that VFS_MOUNT() can get an accurate view of
4944          * the zone's status with regards to ZONE_IS_SHUTTING down.
4945          *
4946          * e.g. NFS can fail the mount if it determines that the zone
4947          * has already begun the shutdown sequence.
4948          *
4949          */
4950         if (block_mounts(zone) == 0) {
4951                 zone_rele(zone);
4952                 return (set_errno(EINTR));
4953         }
4954 
4955         mutex_enter(&zonehash_lock);
4956         mutex_enter(&zone_status_lock);
4957         status = zone_status_get(zone);
4958         /*
4959          * Fail if the zone isn't fully initialized yet.
4960          */
4961         if (status < ZONE_IS_READY) {
4962                 mutex_exit(&zone_status_lock);
4963                 mutex_exit(&zonehash_lock);
4964                 resume_mounts(zone);
4965                 zone_rele(zone);
4966                 return (set_errno(EINVAL));
4967         }
4968         /*
4969          * If conditions required for zone_shutdown() to return have been met,
4970          * return success.
4971          */
4972         if (status >= ZONE_IS_DOWN) {
4973                 mutex_exit(&zone_status_lock);
4974                 mutex_exit(&zonehash_lock);
4975                 resume_mounts(zone);
4976                 zone_rele(zone);
4977                 return (0);
4978         }
4979         /*
4980          * If zone_shutdown() hasn't been called before, go through the motions.
4981          * If it has, there's nothing to do but wait for the kernel threads to
4982          * drain.
4983          */
4984         if (status < ZONE_IS_EMPTY) {
4985                 uint_t ntasks;
4986 
4987                 mutex_enter(&zone->zone_lock);
4988                 if ((ntasks = zone->zone_ntasks) != 1) {
4989                         /*
4990                          * There's still stuff running.
4991                          */
4992                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4993                 }
4994                 mutex_exit(&zone->zone_lock);
4995                 if (ntasks == 1) {
4996                         /*
4997                          * The only way to create another task is through
4998                          * zone_enter(), which will block until we drop
4999                          * zonehash_lock.  The zone is empty.
5000                          */
5001                         if (zone->zone_kthreads == NULL) {
5002                                 /*
5003                                  * Skip ahead to ZONE_IS_DOWN
5004                                  */
5005                                 zone_status_set(zone, ZONE_IS_DOWN);
5006                         } else {
5007                                 zone_status_set(zone, ZONE_IS_EMPTY);
5008                         }
5009                 }
5010         }
5011         mutex_exit(&zone_status_lock);
5012         mutex_exit(&zonehash_lock);
5013         resume_mounts(zone);
5014 
5015         if (error = zone_empty(zone)) {
5016                 zone_rele(zone);
5017                 return (set_errno(error));
5018         }
5019         /*
5020          * After the zone status goes to ZONE_IS_DOWN this zone will no
5021          * longer be notified of changes to the pools configuration, so
5022          * in order to not end up with a stale pool pointer, we point
5023          * ourselves at the default pool and remove all resource
5024          * visibility.  This is especially important as the zone_t may
5025          * languish on the deathrow for a very long time waiting for
5026          * cred's to drain out.
5027          *
5028          * This rebinding of the zone can happen multiple times
5029          * (presumably due to interrupted or parallel systemcalls)
5030          * without any adverse effects.
5031          */
5032         if (pool_lock_intr() != 0) {
5033                 zone_rele(zone);
5034                 return (set_errno(EINTR));
5035         }
5036         if (pool_state == POOL_ENABLED) {
5037                 mutex_enter(&cpu_lock);
5038                 zone_pool_set(zone, pool_default);
5039                 /*
5040                  * The zone no longer needs to be able to see any cpus.
5041                  */
5042                 zone_pset_set(zone, ZONE_PS_INVAL);
5043                 mutex_exit(&cpu_lock);
5044         }
5045         pool_unlock();
5046 
5047         /*
5048          * ZSD shutdown callbacks can be executed multiple times, hence
5049          * it is safe to not be holding any locks across this call.
5050          */
5051         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5052 
5053         mutex_enter(&zone_status_lock);
5054         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5055                 zone_status_set(zone, ZONE_IS_DOWN);
5056         mutex_exit(&zone_status_lock);
5057 
5058         /*
5059          * Wait for kernel threads to drain.
5060          */
5061         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5062                 zone_rele(zone);
5063                 return (set_errno(EINTR));
5064         }
5065 
5066         /*
5067          * Zone can become down/destroyable even if the above wait
5068          * returns EINTR, so any code added here may never execute.
5069          * (i.e. don't add code here)
5070          */
5071 
5072         zone_rele(zone);
5073         return (0);
5074 }
5075 
5076 /*
5077  * Log the specified zone's reference counts.  The caller should not be
5078  * holding the zone's zone_lock.
5079  */
5080 static void
5081 zone_log_refcounts(zone_t *zone)
5082 {
5083         char *buffer;
5084         char *buffer_position;
5085         uint32_t buffer_size;
5086         uint32_t index;
5087         uint_t ref;
5088         uint_t cred_ref;
5089 
5090         /*
5091          * Construct a string representing the subsystem-specific reference
5092          * counts.  The counts are printed in ascending order by index into the
5093          * zone_t::zone_subsys_ref array.  The list will be surrounded by
5094          * square brackets [] and will only contain nonzero reference counts.
5095          *
5096          * The buffer will hold two square bracket characters plus ten digits,
5097          * one colon, one space, one comma, and some characters for a
5098          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5099          * bit integers have at most ten decimal digits.)  The last
5100          * reference count's comma is replaced by the closing square
5101          * bracket and a NULL character to terminate the string.
5102          *
5103          * NOTE: We have to grab the zone's zone_lock to create a consistent
5104          * snapshot of the zone's reference counters.
5105          *
5106          * First, figure out how much space the string buffer will need.
5107          * The buffer's size is stored in buffer_size.
5108          */
5109         buffer_size = 2;                        /* for the square brackets */
5110         mutex_enter(&zone->zone_lock);
5111         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5112         ref = zone->zone_ref;
5113         cred_ref = zone->zone_cred_ref;
5114         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5115                 if (zone->zone_subsys_ref[index] != 0)
5116                         buffer_size += strlen(zone_ref_subsys_names[index]) +
5117                             13;
5118         if (buffer_size == 2) {
5119                 /*
5120                  * No subsystems had nonzero reference counts.  Don't bother
5121                  * with allocating a buffer; just log the general-purpose and
5122                  * credential reference counts.
5123                  */
5124                 mutex_exit(&zone->zone_lock);
5125                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5126                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
5127                     "references and %u credential references are still extant",
5128                     zone->zone_name, zone->zone_id, ref, cred_ref);
5129                 return;
5130         }
5131 
5132         /*
5133          * buffer_size contains the exact number of characters that the
5134          * buffer will need.  Allocate the buffer and fill it with nonzero
5135          * subsystem-specific reference counts.  Surround the results with
5136          * square brackets afterwards.
5137          */
5138         buffer = kmem_alloc(buffer_size, KM_SLEEP);
5139         buffer_position = &buffer[1];
5140         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5141                 /*
5142                  * NOTE: The DDI's version of sprintf() returns a pointer to
5143                  * the modified buffer rather than the number of bytes written
5144                  * (as in snprintf(3C)).  This is unfortunate and annoying.
5145                  * Therefore, we'll use snprintf() with INT_MAX to get the
5146                  * number of bytes written.  Using INT_MAX is safe because
5147                  * the buffer is perfectly sized for the data: we'll never
5148                  * overrun the buffer.
5149                  */
5150                 if (zone->zone_subsys_ref[index] != 0)
5151                         buffer_position += snprintf(buffer_position, INT_MAX,
5152                             "%s: %u,", zone_ref_subsys_names[index],
5153                             zone->zone_subsys_ref[index]);
5154         }
5155         mutex_exit(&zone->zone_lock);
5156         buffer[0] = '[';
5157         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5158         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5159         buffer_position[-1] = ']';
5160 
5161         /*
5162          * Log the reference counts and free the message buffer.
5163          */
5164         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5165             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5166             "%u credential references are still extant %s", zone->zone_name,
5167             zone->zone_id, ref, cred_ref, buffer);
5168         kmem_free(buffer, buffer_size);
5169 }
5170 
5171 /*
5172  * Systemcall entry point to finalize the zone halt process.  The caller
5173  * must have already successfully called zone_shutdown().
5174  *
5175  * Upon successful completion, the zone will have been fully destroyed:
5176  * zsched will have exited, destructor callbacks executed, and the zone
5177  * removed from the list of active zones.
5178  */
5179 static int
5180 zone_destroy(zoneid_t zoneid)
5181 {
5182         uint64_t uniqid;
5183         zone_t *zone;
5184         zone_status_t status;
5185         clock_t wait_time;
5186         boolean_t log_refcounts;
5187 
5188         if (secpolicy_zone_config(CRED()) != 0)
5189                 return (set_errno(EPERM));
5190         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5191                 return (set_errno(EINVAL));
5192 
5193         mutex_enter(&zonehash_lock);
5194         /*
5195          * Look for zone under hash lock to prevent races with other
5196          * calls to zone_destroy.
5197          */
5198         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5199                 mutex_exit(&zonehash_lock);
5200                 return (set_errno(EINVAL));
5201         }
5202 
5203         if (zone_mount_count(zone->zone_rootpath) != 0) {
5204                 mutex_exit(&zonehash_lock);
5205                 return (set_errno(EBUSY));
5206         }
5207         mutex_enter(&zone_status_lock);
5208         status = zone_status_get(zone);
5209         if (status < ZONE_IS_DOWN) {
5210                 mutex_exit(&zone_status_lock);
5211                 mutex_exit(&zonehash_lock);
5212                 return (set_errno(EBUSY));
5213         } else if (status == ZONE_IS_DOWN) {
5214                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5215         }
5216         mutex_exit(&zone_status_lock);
5217         zone_hold(zone);
5218         mutex_exit(&zonehash_lock);
5219 
5220         /*
5221          * wait for zsched to exit
5222          */
5223         zone_status_wait(zone, ZONE_IS_DEAD);
5224         zone_zsd_callbacks(zone, ZSD_DESTROY);
5225         zone->zone_netstack = NULL;
5226         uniqid = zone->zone_uniqid;
5227         zone_rele(zone);
5228         zone = NULL;    /* potentially free'd */
5229 
5230         log_refcounts = B_FALSE;
5231         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5232         mutex_enter(&zonehash_lock);
5233         for (; /* ever */; ) {
5234                 boolean_t unref;
5235                 boolean_t refs_have_been_logged;
5236 
5237                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5238                     zone->zone_uniqid != uniqid) {
5239                         /*
5240                          * The zone has gone away.  Necessary conditions
5241                          * are met, so we return success.
5242                          */
5243                         mutex_exit(&zonehash_lock);
5244                         return (0);
5245                 }
5246                 mutex_enter(&zone->zone_lock);
5247                 unref = ZONE_IS_UNREF(zone);
5248                 refs_have_been_logged = (zone->zone_flags &
5249                     ZF_REFCOUNTS_LOGGED);
5250                 mutex_exit(&zone->zone_lock);
5251                 if (unref) {
5252                         /*
5253                          * There is only one reference to the zone -- that
5254                          * added when the zone was added to the hashtables --
5255                          * and things will remain this way until we drop
5256                          * zonehash_lock... we can go ahead and cleanup the
5257                          * zone.
5258                          */
5259                         break;
5260                 }
5261 
5262                 /*
5263                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5264                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5265                  * some zone's general-purpose reference count reaches one.
5266                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5267                  * on zone_destroy_cv, then log the zone's reference counts and
5268                  * continue to wait for zone_rele() and zone_cred_rele().
5269                  */
5270                 if (!refs_have_been_logged) {
5271                         if (!log_refcounts) {
5272                                 /*
5273                                  * This thread hasn't timed out waiting on
5274                                  * zone_destroy_cv yet.  Wait wait_time clock
5275                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5276                                  * seconds) for the zone's references to clear.
5277                                  */
5278                                 ASSERT(wait_time > 0);
5279                                 wait_time = cv_reltimedwait_sig(
5280                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5281                                     TR_SEC);
5282                                 if (wait_time > 0) {
5283                                         /*
5284                                          * A thread in zone_rele() or
5285                                          * zone_cred_rele() signaled
5286                                          * zone_destroy_cv before this thread's
5287                                          * wait timed out.  The zone might have
5288                                          * only one reference left; find out!
5289                                          */
5290                                         continue;
5291                                 } else if (wait_time == 0) {
5292                                         /* The thread's process was signaled. */
5293                                         mutex_exit(&zonehash_lock);
5294                                         return (set_errno(EINTR));
5295                                 }
5296 
5297                                 /*
5298                                  * The thread timed out while waiting on
5299                                  * zone_destroy_cv.  Even though the thread
5300                                  * timed out, it has to check whether another
5301                                  * thread woke up from zone_destroy_cv and
5302                                  * destroyed the zone.
5303                                  *
5304                                  * If the zone still exists and has more than
5305                                  * one unreleased general-purpose reference,
5306                                  * then log the zone's reference counts.
5307                                  */
5308                                 log_refcounts = B_TRUE;
5309                                 continue;
5310                         }
5311 
5312                         /*
5313                          * The thread already timed out on zone_destroy_cv while
5314                          * waiting for subsystems to release the zone's last
5315                          * general-purpose references.  Log the zone's reference
5316                          * counts and wait indefinitely on zone_destroy_cv.
5317                          */
5318                         zone_log_refcounts(zone);
5319                 }
5320                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5321                         /* The thread's process was signaled. */
5322                         mutex_exit(&zonehash_lock);
5323                         return (set_errno(EINTR));
5324                 }
5325         }
5326 
5327         /*
5328          * Remove CPU cap for this zone now since we're not going to
5329          * fail below this point.
5330          */
5331         cpucaps_zone_remove(zone);
5332 
5333         /* Get rid of the zone's kstats */
5334         zone_kstat_delete(zone);
5335 
5336         /* remove the pfexecd doors */
5337         if (zone->zone_pfexecd != NULL) {
5338                 klpd_freelist(&zone->zone_pfexecd);
5339                 zone->zone_pfexecd = NULL;
5340         }
5341 
5342         /* free brand specific data */
5343         if (ZONE_IS_BRANDED(zone))
5344                 ZBROP(zone)->b_free_brand_data(zone);
5345 
5346         /* Say goodbye to brand framework. */
5347         brand_unregister_zone(zone->zone_brand);
5348 
5349         /*
5350          * It is now safe to let the zone be recreated; remove it from the
5351          * lists.  The memory will not be freed until the last cred
5352          * reference goes away.
5353          */
5354         ASSERT(zonecount > 1);       /* must be > 1; can't destroy global zone */
5355         zonecount--;
5356         /* remove from active list and hash tables */
5357         list_remove(&zone_active, zone);
5358         (void) mod_hash_destroy(zonehashbyname,
5359             (mod_hash_key_t)zone->zone_name);
5360         (void) mod_hash_destroy(zonehashbyid,
5361             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5362         if (zone->zone_flags & ZF_HASHED_LABEL)
5363                 (void) mod_hash_destroy(zonehashbylabel,
5364                     (mod_hash_key_t)zone->zone_slabel);
5365         mutex_exit(&zonehash_lock);
5366 
5367         /*
5368          * Release the root vnode; we're not using it anymore.  Nor should any
5369          * other thread that might access it exist.
5370          */
5371         if (zone->zone_rootvp != NULL) {
5372                 VN_RELE(zone->zone_rootvp);
5373                 zone->zone_rootvp = NULL;
5374         }
5375 
5376         /* add to deathrow list */
5377         mutex_enter(&zone_deathrow_lock);
5378         list_insert_tail(&zone_deathrow, zone);
5379         mutex_exit(&zone_deathrow_lock);
5380 
5381         /*
5382          * Drop last reference (which was added by zsched()), this will
5383          * free the zone unless there are outstanding cred references.
5384          */
5385         zone_rele(zone);
5386         return (0);
5387 }
5388 
5389 /*
5390  * Systemcall entry point for zone_getattr(2).
5391  */
5392 static ssize_t
5393 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5394 {
5395         size_t size;
5396         int error = 0, err;
5397         zone_t *zone;
5398         char *zonepath;
5399         char *outstr;
5400         zone_status_t zone_status;
5401         pid_t initpid;
5402         boolean_t global = (curzone == global_zone);
5403         boolean_t inzone = (curzone->zone_id == zoneid);
5404         ushort_t flags;
5405         zone_net_data_t *zbuf;
5406 
5407         mutex_enter(&zonehash_lock);
5408         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5409                 mutex_exit(&zonehash_lock);
5410                 return (set_errno(EINVAL));
5411         }
5412         zone_status = zone_status_get(zone);
5413         if (zone_status < ZONE_IS_INITIALIZED) {
5414                 mutex_exit(&zonehash_lock);
5415                 return (set_errno(EINVAL));
5416         }
5417         zone_hold(zone);
5418         mutex_exit(&zonehash_lock);
5419 
5420         /*
5421          * If not in the global zone, don't show information about other zones,
5422          * unless the system is labeled and the local zone's label dominates
5423          * the other zone.
5424          */
5425         if (!zone_list_access(zone)) {
5426                 zone_rele(zone);
5427                 return (set_errno(EINVAL));
5428         }
5429 
5430         switch (attr) {
5431         case ZONE_ATTR_ROOT:
5432                 if (global) {
5433                         /*
5434                          * Copy the path to trim the trailing "/" (except for
5435                          * the global zone).
5436                          */
5437                         if (zone != global_zone)
5438                                 size = zone->zone_rootpathlen - 1;
5439                         else
5440                                 size = zone->zone_rootpathlen;
5441                         zonepath = kmem_alloc(size, KM_SLEEP);
5442                         bcopy(zone->zone_rootpath, zonepath, size);
5443                         zonepath[size - 1] = '\0';
5444                 } else {
5445                         if (inzone || !is_system_labeled()) {
5446                                 /*
5447                                  * Caller is not in the global zone.
5448                                  * if the query is on the current zone
5449                                  * or the system is not labeled,
5450                                  * just return faked-up path for current zone.
5451                                  */
5452                                 zonepath = "/";
5453                                 size = 2;
5454                         } else {
5455                                 /*
5456                                  * Return related path for current zone.
5457                                  */
5458                                 int prefix_len = strlen(zone_prefix);
5459                                 int zname_len = strlen(zone->zone_name);
5460 
5461                                 size = prefix_len + zname_len + 1;
5462                                 zonepath = kmem_alloc(size, KM_SLEEP);
5463                                 bcopy(zone_prefix, zonepath, prefix_len);
5464                                 bcopy(zone->zone_name, zonepath +
5465                                     prefix_len, zname_len);
5466                                 zonepath[size - 1] = '\0';
5467                         }
5468                 }
5469                 if (bufsize > size)
5470                         bufsize = size;
5471                 if (buf != NULL) {
5472                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5473                         if (err != 0 && err != ENAMETOOLONG)
5474                                 error = EFAULT;
5475                 }
5476                 if (global || (is_system_labeled() && !inzone))
5477                         kmem_free(zonepath, size);
5478                 break;
5479 
5480         case ZONE_ATTR_NAME:
5481                 size = strlen(zone->zone_name) + 1;
5482                 if (bufsize > size)
5483                         bufsize = size;
5484                 if (buf != NULL) {
5485                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5486                         if (err != 0 && err != ENAMETOOLONG)
5487                                 error = EFAULT;
5488                 }
5489                 break;
5490 
5491         case ZONE_ATTR_STATUS:
5492                 /*
5493                  * Since we're not holding zonehash_lock, the zone status
5494                  * may be anything; leave it up to userland to sort it out.
5495                  */
5496                 size = sizeof (zone_status);
5497                 if (bufsize > size)
5498                         bufsize = size;
5499                 zone_status = zone_status_get(zone);
5500                 if (buf != NULL &&
5501                     copyout(&zone_status, buf, bufsize) != 0)
5502                         error = EFAULT;
5503                 break;
5504         case ZONE_ATTR_FLAGS:
5505                 size = sizeof (zone->zone_flags);
5506                 if (bufsize > size)
5507                         bufsize = size;
5508                 flags = zone->zone_flags;
5509                 if (buf != NULL &&
5510                     copyout(&flags, buf, bufsize) != 0)
5511                         error = EFAULT;
5512                 break;
5513         case ZONE_ATTR_PRIVSET:
5514                 size = sizeof (priv_set_t);
5515                 if (bufsize > size)
5516                         bufsize = size;
5517                 if (buf != NULL &&
5518                     copyout(zone->zone_privset, buf, bufsize) != 0)
5519                         error = EFAULT;
5520                 break;
5521         case ZONE_ATTR_UNIQID:
5522                 size = sizeof (zone->zone_uniqid);
5523                 if (bufsize > size)
5524                         bufsize = size;
5525                 if (buf != NULL &&
5526                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5527                         error = EFAULT;
5528                 break;
5529         case ZONE_ATTR_POOLID:
5530                 {
5531                         pool_t *pool;
5532                         poolid_t poolid;
5533 
5534                         if (pool_lock_intr() != 0) {
5535                                 error = EINTR;
5536                                 break;
5537                         }
5538                         pool = zone_pool_get(zone);
5539                         poolid = pool->pool_id;
5540                         pool_unlock();
5541                         size = sizeof (poolid);
5542                         if (bufsize > size)
5543                                 bufsize = size;
5544                         if (buf != NULL && copyout(&poolid, buf, size) != 0)
5545                                 error = EFAULT;
5546                 }
5547                 break;
5548         case ZONE_ATTR_SLBL:
5549                 size = sizeof (bslabel_t);
5550                 if (bufsize > size)
5551                         bufsize = size;
5552                 if (zone->zone_slabel == NULL)
5553                         error = EINVAL;
5554                 else if (buf != NULL &&
5555                     copyout(label2bslabel(zone->zone_slabel), buf,
5556                     bufsize) != 0)
5557                         error = EFAULT;
5558                 break;
5559         case ZONE_ATTR_INITPID:
5560                 size = sizeof (initpid);
5561                 if (bufsize > size)
5562                         bufsize = size;
5563                 initpid = zone->zone_proc_initpid;
5564                 if (initpid == -1) {
5565                         error = ESRCH;
5566                         break;
5567                 }
5568                 if (buf != NULL &&
5569                     copyout(&initpid, buf, bufsize) != 0)
5570                         error = EFAULT;
5571                 break;
5572         case ZONE_ATTR_BRAND:
5573                 size = strlen(zone->zone_brand->b_name) + 1;
5574 
5575                 if (bufsize > size)
5576                         bufsize = size;
5577                 if (buf != NULL) {
5578                         err = copyoutstr(zone->zone_brand->b_name, buf,
5579                             bufsize, NULL);
5580                         if (err != 0 && err != ENAMETOOLONG)
5581                                 error = EFAULT;
5582                 }
5583                 break;
5584         case ZONE_ATTR_INITNAME:
5585                 size = strlen(zone->zone_initname) + 1;
5586                 if (bufsize > size)
5587                         bufsize = size;
5588                 if (buf != NULL) {
5589                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5590                             NULL);
5591                         if (err != 0 && err != ENAMETOOLONG)
5592                                 error = EFAULT;
5593                 }
5594                 break;
5595         case ZONE_ATTR_BOOTARGS:
5596                 if (zone->zone_bootargs == NULL)
5597                         outstr = "";
5598                 else
5599                         outstr = zone->zone_bootargs;
5600                 size = strlen(outstr) + 1;
5601                 if (bufsize > size)
5602                         bufsize = size;
5603                 if (buf != NULL) {
5604                         err = copyoutstr(outstr, buf, bufsize, NULL);
5605                         if (err != 0 && err != ENAMETOOLONG)
5606                                 error = EFAULT;
5607                 }
5608                 break;
5609         case ZONE_ATTR_PHYS_MCAP:
5610                 size = sizeof (zone->zone_phys_mcap);
5611                 if (bufsize > size)
5612                         bufsize = size;
5613                 if (buf != NULL &&
5614                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5615                         error = EFAULT;
5616                 break;
5617         case ZONE_ATTR_SCHED_CLASS:
5618                 mutex_enter(&class_lock);
5619 
5620                 if (zone->zone_defaultcid >= loaded_classes)
5621                         outstr = "";
5622                 else
5623                         outstr = sclass[zone->zone_defaultcid].cl_name;
5624                 size = strlen(outstr) + 1;
5625                 if (bufsize > size)
5626                         bufsize = size;
5627                 if (buf != NULL) {
5628                         err = copyoutstr(outstr, buf, bufsize, NULL);
5629                         if (err != 0 && err != ENAMETOOLONG)
5630                                 error = EFAULT;
5631                 }
5632 
5633                 mutex_exit(&class_lock);
5634                 break;
5635         case ZONE_ATTR_HOSTID:
5636                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5637                     bufsize == sizeof (zone->zone_hostid)) {
5638                         size = sizeof (zone->zone_hostid);
5639                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5640                             bufsize) != 0)
5641                                 error = EFAULT;
5642                 } else {
5643                         error = EINVAL;
5644                 }
5645                 break;
5646         case ZONE_ATTR_FS_ALLOWED:
5647                 if (zone->zone_fs_allowed == NULL)
5648                         outstr = "";
5649                 else
5650                         outstr = zone->zone_fs_allowed;
5651                 size = strlen(outstr) + 1;
5652                 if (bufsize > size)
5653                         bufsize = size;
5654                 if (buf != NULL) {
5655                         err = copyoutstr(outstr, buf, bufsize, NULL);
5656                         if (err != 0 && err != ENAMETOOLONG)
5657                                 error = EFAULT;
5658                 }
5659                 break;
5660         case ZONE_ATTR_SECFLAGS:
5661                 size = sizeof (zone->zone_secflags);
5662                 if (bufsize > size)
5663                         bufsize = size;
5664                 if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
5665                         error = EFAULT;
5666                 break;
5667         case ZONE_ATTR_NETWORK:
5668                 bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
5669                 size = bufsize;
5670                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5671                 if (copyin(buf, zbuf, bufsize) != 0) {
5672                         error = EFAULT;
5673                 } else {
5674                         error = zone_get_network(zoneid, zbuf);
5675                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5676                                 error = EFAULT;
5677                 }
5678                 kmem_free(zbuf, bufsize);
5679                 break;
5680         default:
5681                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5682                         size = bufsize;
5683                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5684                 } else {
5685                         error = EINVAL;
5686                 }
5687         }
5688         zone_rele(zone);
5689 
5690         if (error)
5691                 return (set_errno(error));
5692         return ((ssize_t)size);
5693 }
5694 
5695 /*
5696  * Systemcall entry point for zone_setattr(2).
5697  */
5698 /*ARGSUSED*/
5699 static int
5700 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5701 {
5702         zone_t *zone;
5703         zone_status_t zone_status;
5704         int err = -1;
5705         zone_net_data_t *zbuf;
5706 
5707         if (secpolicy_zone_config(CRED()) != 0)
5708                 return (set_errno(EPERM));
5709 
5710         /*
5711          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5712          * global zone.
5713          */
5714         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5715                 return (set_errno(EINVAL));
5716         }
5717 
5718         mutex_enter(&zonehash_lock);
5719         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5720                 mutex_exit(&zonehash_lock);
5721                 return (set_errno(EINVAL));
5722         }
5723         zone_hold(zone);
5724         mutex_exit(&zonehash_lock);
5725 
5726         /*
5727          * At present most attributes can only be set on non-running,
5728          * non-global zones.
5729          */
5730         zone_status = zone_status_get(zone);
5731         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5732                 err = EINVAL;
5733                 goto done;
5734         }
5735 
5736         switch (attr) {
5737         case ZONE_ATTR_INITNAME:
5738                 err = zone_set_initname(zone, (const char *)buf);
5739                 break;
5740         case ZONE_ATTR_INITNORESTART:
5741                 zone->zone_restart_init = B_FALSE;
5742                 err = 0;
5743                 break;
5744         case ZONE_ATTR_BOOTARGS:
5745                 err = zone_set_bootargs(zone, (const char *)buf);
5746                 break;
5747         case ZONE_ATTR_BRAND:
5748                 err = zone_set_brand(zone, (const char *)buf);
5749                 break;
5750         case ZONE_ATTR_FS_ALLOWED:
5751                 err = zone_set_fs_allowed(zone, (const char *)buf);
5752                 break;
5753         case ZONE_ATTR_SECFLAGS:
5754                 err = zone_set_secflags(zone, (psecflags_t *)buf);
5755                 break;
5756         case ZONE_ATTR_PHYS_MCAP:
5757                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5758                 break;
5759         case ZONE_ATTR_SCHED_CLASS:
5760                 err = zone_set_sched_class(zone, (const char *)buf);
5761                 break;
5762         case ZONE_ATTR_HOSTID:
5763                 if (bufsize == sizeof (zone->zone_hostid)) {
5764                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5765                                 err = 0;
5766                         else
5767                                 err = EFAULT;
5768                 } else {
5769                         err = EINVAL;
5770                 }
5771                 break;
5772         case ZONE_ATTR_NETWORK:
5773                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5774                         err = EINVAL;
5775                         break;
5776                 }
5777                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5778                 if (copyin(buf, zbuf, bufsize) != 0) {
5779                         kmem_free(zbuf, bufsize);
5780                         err = EFAULT;
5781                         break;
5782                 }
5783                 err = zone_set_network(zoneid, zbuf);
5784                 kmem_free(zbuf, bufsize);
5785                 break;
5786         default:
5787                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5788                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5789                 else
5790                         err = EINVAL;
5791         }
5792 
5793 done:
5794         zone_rele(zone);
5795         ASSERT(err != -1);
5796         return (err != 0 ? set_errno(err) : 0);
5797 }
5798 
5799 /*
5800  * Return zero if the process has at least one vnode mapped in to its
5801  * address space which shouldn't be allowed to change zones.
5802  *
5803  * Also return zero if the process has any shared mappings which reserve
5804  * swap.  This is because the counting for zone.max-swap does not allow swap
5805  * reservation to be shared between zones.  zone swap reservation is counted
5806  * on zone->zone_max_swap.
5807  */
5808 static int
5809 as_can_change_zones(void)
5810 {
5811         proc_t *pp = curproc;
5812         struct seg *seg;
5813         struct as *as = pp->p_as;
5814         vnode_t *vp;
5815         int allow = 1;
5816 
5817         ASSERT(pp->p_as != &kas);
5818         AS_LOCK_ENTER(as, RW_READER);
5819         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5820 
5821                 /*
5822                  * Cannot enter zone with shared anon memory which
5823                  * reserves swap.  See comment above.
5824                  */
5825                 if (seg_can_change_zones(seg) == B_FALSE) {
5826                         allow = 0;
5827                         break;
5828                 }
5829                 /*
5830                  * if we can't get a backing vnode for this segment then skip
5831                  * it.
5832                  */
5833                 vp = NULL;
5834                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5835                         continue;
5836                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5837                         allow = 0;
5838                         break;
5839                 }
5840         }
5841         AS_LOCK_EXIT(as);
5842         return (allow);
5843 }
5844 
5845 /*
5846  * Count swap reserved by curproc's address space
5847  */
5848 static size_t
5849 as_swresv(void)
5850 {
5851         proc_t *pp = curproc;
5852         struct seg *seg;
5853         struct as *as = pp->p_as;
5854         size_t swap = 0;
5855 
5856         ASSERT(pp->p_as != &kas);
5857         ASSERT(AS_WRITE_HELD(as));
5858         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5859                 swap += seg_swresv(seg);
5860 
5861         return (swap);
5862 }
5863 
5864 /*
5865  * Systemcall entry point for zone_enter().
5866  *
5867  * The current process is injected into said zone.  In the process
5868  * it will change its project membership, privileges, rootdir/cwd,
5869  * zone-wide rctls, and pool association to match those of the zone.
5870  *
5871  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5872  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5873  * enter a zone that is "ready" or "running".
      *
      * Returns 0 on success; otherwise the error is handed to set_errno():
      *   EPERM   secpolicy_zone_config() rejects the caller, or the zone's
      *           privilege set is not within the caller's permitted set.
      *   EINVAL  zoneid is outside [MIN_USERZONEID, MAX_ZONEID], the caller
      *           is not in the global zone, the zone cannot be found or is
      *           not ready/running, or one of the contract checks fails.
      *   EINTR   holdlwps() fails or pool_lock_intr() returns non-zero.
      *   EBADF   files_can_change_zones() fails.
      *   EFAULT  as_can_change_zones() fails.
      * A non-zero result from pool_do_bind() is returned unchanged.
      *
      * Every failure after the lwps have been held leaves through 'out',
      * which lets the other lwps continue.
5874  */
5875 static int
5876 zone_enter(zoneid_t zoneid)
5877 {
5878         zone_t *zone;
5879         vnode_t *vp;
5880         proc_t *pp = curproc;
5881         contract_t *ct;
5882         cont_process_t *ctp;
5883         task_t *tk, *oldtk;
5884         kproject_t *zone_proj0;
5885         cred_t *cr, *newcr;
5886         pool_t *oldpool, *newpool;
5887         sess_t *sp;
5888         uid_t uid;
5889         zone_status_t status;
5890         int err = 0;
5891         rctl_entity_p_t e;
5892         size_t swap;
5893         kthread_id_t t;
5894
5895         if (secpolicy_zone_config(CRED()) != 0)
5896                 return (set_errno(EPERM));
5897         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5898                 return (set_errno(EINVAL));
5899
5900         /*
5901          * Stop all lwps so we don't need to hold a lock to look at
5902          * curproc->p_zone.  This needs to happen before we grab any
5903          * locks to avoid deadlock (another lwp in the process could
5904          * be waiting for the held lock).
5905          */
5906         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5907                 return (set_errno(EINTR));
5908
5909         /*
5910          * Make sure we're not changing zones with files open or mapped in
5911          * to our address space which shouldn't be changing zones.
5912          */
5913         if (!files_can_change_zones()) {
5914                 err = EBADF;
5915                 goto out;
5916         }
5917         if (!as_can_change_zones()) {
5918                 err = EFAULT;
5919                 goto out;
5920         }
5921
5922         mutex_enter(&zonehash_lock);
5923         if (pp->p_zone != global_zone) {
5924                 mutex_exit(&zonehash_lock);
5925                 err = EINVAL;
5926                 goto out;
5927         }
5928
5929         zone = zone_find_all_by_id(zoneid);
5930         if (zone == NULL) {
5931                 mutex_exit(&zonehash_lock);
5932                 err = EINVAL;
5933                 goto out;
5934         }
5935
5936         /*
5937          * To prevent processes in a zone from holding contracts on
5938          * extrazonal resources, and to avoid process contract
5939          * memberships which span zones, contract holders and processes
5940          * which aren't the sole members of their encapsulating process
5941          * contracts are not allowed to zone_enter.
5942          */
5943         ctp = pp->p_ct_process;
5944         ct = &ctp->conp_contract;
5945         mutex_enter(&ct->ct_lock);
5946         mutex_enter(&pp->p_lock);
5947         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5948                 mutex_exit(&pp->p_lock);
5949                 mutex_exit(&ct->ct_lock);
5950                 mutex_exit(&zonehash_lock);
5951                 err = EINVAL;
5952                 goto out;
5953         }
5954
5955         /*
5956          * Moreover, we don't allow processes whose encapsulating
5957          * process contracts have inherited extrazonal contracts.
5958          * While it would be easier to eliminate all process contracts
5959          * with inherited contracts, we need to be able to give a
5960          * restarted init (or other zone-penetrating process) its
5961          * predecessor's contracts.
5962          */
5963         if (ctp->conp_ninherited != 0) {
5964                 contract_t *next;
5965                 for (next = list_head(&ctp->conp_inherited); next;
5966                     next = list_next(&ctp->conp_inherited, next)) {
5967                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5968                                 mutex_exit(&pp->p_lock);
5969                                 mutex_exit(&ct->ct_lock);
5970                                 mutex_exit(&zonehash_lock);
5971                                 err = EINVAL;
5972                                 goto out;
5973                         }
5974                 }
5975         }
5976
5977         mutex_exit(&pp->p_lock);
5978         mutex_exit(&ct->ct_lock);
5979
5980         status = zone_status_get(zone);
5981         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5982                 /*
5983                  * Can't join
5984                  */
5985                 mutex_exit(&zonehash_lock);
5986                 err = EINVAL;
5987                 goto out;
5988         }
5989
5990         /*
5991          * Make sure new priv set is within the permitted set for caller
5992          */
5993         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5994                 mutex_exit(&zonehash_lock);
5995                 err = EPERM;
5996                 goto out;
5997         }
5998         /*
5999          * We want to momentarily drop zonehash_lock while we optimistically
6000          * bind curproc to the pool it should be running in.  This is safe
6001          * since the zone can't disappear (we have a hold on it).
6002          */
6003         zone_hold(zone);
6004         mutex_exit(&zonehash_lock);
6005
6006         /*
6007          * Grab pool_lock to keep the pools configuration from changing
6008          * and to stop ourselves from getting rebound to another pool
6009          * until we join the zone.
6010          */
6011         if (pool_lock_intr() != 0) {
6012                 zone_rele(zone);
6013                 err = EINTR;
6014                 goto out;
6015         }
6016         ASSERT(secpolicy_pool(CRED()) == 0);
6017         /*
6018          * Bind ourselves to the pool currently associated with the zone.
6019          */
6020         oldpool = curproc->p_pool;
6021         newpool = zone_pool_get(zone);
6022         if (pool_state == POOL_ENABLED && newpool != oldpool &&
6023             (err = pool_do_bind(newpool, P_PID, P_MYID,
6024             POOL_BIND_ALL)) != 0) {
6025                 pool_unlock();
6026                 zone_rele(zone);
6027                 goto out;
6028         }
6029
6030         /*
6031          * Grab cpu_lock now; we'll need it later when we call
6032          * task_join().
6033          */
6034         mutex_enter(&cpu_lock);
6035         mutex_enter(&zonehash_lock);
6036         /*
6037          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6038          */
6039         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6040                 /*
6041                  * Can't join anymore.
                      * Undo the optimistic pool binding made above before
                      * bailing out.
6042                  */
6043                 mutex_exit(&zonehash_lock);
6044                 mutex_exit(&cpu_lock);
6045                 if (pool_state == POOL_ENABLED &&
6046                     newpool != oldpool)
6047                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
6048                             POOL_BIND_ALL);
6049                 pool_unlock();
6050                 zone_rele(zone);
6051                 err = EINVAL;
6052                 goto out;
6053         }
6054
6055         /*
6056          * a_lock must be held while transferring locked memory and swap
6057          * reservation from the global zone to the non global zone because
6058          * asynchronous faults on the processes' address space can lock
6059          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6060          * segments respectively.
6061          */
6062         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6063         swap = as_swresv();
6064         mutex_enter(&pp->p_lock);
6065         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6066         /* verify that we do not exceed any task or lwp limits */
             /*
              * NOTE(review): the statements below only add to the counters;
              * no comparison against a limit is visible in this function.
              */
6067         mutex_enter(&zone->zone_nlwps_lock);
6068         /* add new lwps to zone and zone's proj0 */
6069         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6070         zone->zone_nlwps += pp->p_lwpcnt;
6071         /* add 1 task to zone's proj0 */
6072         zone_proj0->kpj_ntasks += 1;
6073
6074         zone_proj0->kpj_nprocs++;
6075         zone->zone_nprocs++;
6076         mutex_exit(&zone->zone_nlwps_lock);
6077
6078         mutex_enter(&zone->zone_mem_lock);
6079         zone->zone_locked_mem += pp->p_locked_mem;
6080         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6081         zone->zone_max_swap += swap;
6082         mutex_exit(&zone->zone_mem_lock);
6083
6084         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6085         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6086         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6087
6088         /* remove lwps and process from proc's old zone and old project */
6089         mutex_enter(&pp->p_zone->zone_nlwps_lock);
6090         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6091         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6092         pp->p_task->tk_proj->kpj_nprocs--;
6093         pp->p_zone->zone_nprocs--;
6094         mutex_exit(&pp->p_zone->zone_nlwps_lock);
6095
6096         mutex_enter(&pp->p_zone->zone_mem_lock);
6097         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6098         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6099         pp->p_zone->zone_max_swap -= swap;
6100         mutex_exit(&pp->p_zone->zone_mem_lock);
6101
6102         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6103         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6104         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6105
6106         pp->p_flag |= SZONETOP;
6107         pp->p_zone = zone;
6108         mutex_exit(&pp->p_lock);
6109         AS_LOCK_EXIT(pp->p_as);
6110
6111         /*
6112          * Joining the zone cannot fail from now on.
6113          *
6114          * This means that a lot of the following code can be commonized and
6115          * shared with zsched().
6116          */
6117
6118         /*
6119          * If the process contract fmri was inherited, we need to
6120          * flag this so that any contract status will not leak
6121          * extra zone information, svc_fmri in this case
6122          */
6123         if (ctp->conp_svc_ctid != ct->ct_id) {
6124                 mutex_enter(&ct->ct_lock);
6125                 ctp->conp_svc_zone_enter = ct->ct_id;
6126                 mutex_exit(&ct->ct_lock);
6127         }
6128
6129         /*
6130          * Reset the encapsulating process contract's zone.
6131          */
6132         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6133         contract_setzuniqid(ct, zone->zone_uniqid);
6134
6135         /*
6136          * Create a new task and associate the process with the project keyed
6137          * by (projid,zoneid).
6138          *
6139          * We might as well be in project 0; the global zone's projid doesn't
6140          * make much sense in a zone anyhow.
6141          *
6142          * This also increments zone_ntasks, and returns with p_lock held.
6143          */
6144         tk = task_create(0, zone);
6145         oldtk = task_join(tk, 0);
6146         mutex_exit(&cpu_lock);
6147
6148         /*
6149          * call RCTLOP_SET functions on this proc
6150          */
6151         e.rcep_p.zone = zone;
6152         e.rcep_t = RCENTITY_ZONE;
6153         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6154             RCD_CALLBACK);
6155         mutex_exit(&pp->p_lock);
6156
6157         /*
6158          * We don't need to hold any of zsched's locks here; not only do we know
6159          * the process and zone aren't going away, we know its session isn't
6160          * changing either.
6161          *
6162          * By joining zsched's session here, we mimic the behavior in the
6163          * global zone of init's sid being the pid of sched.  We extend this
6164          * to all zlogin-like zone_enter()'ing processes as well.
6165          */
6166         mutex_enter(&pidlock);
6167         sp = zone->zone_zsched->p_sessp;
6168         sess_hold(zone->zone_zsched);
6169         mutex_enter(&pp->p_lock);
6170         pgexit(pp);
6171         sess_rele(pp->p_sessp, B_TRUE);
6172         pp->p_sessp = sp;
6173         pgjoin(pp, zone->zone_zsched->p_pidp);
6174
6175         /*
6176          * If any threads are scheduled to be placed on zone wait queue they
6177          * should abandon the idea since the wait queue is changing.
6178          * We need to be holding pidlock & p_lock to do this.
6179          */
6180         if ((t = pp->p_tlist) != NULL) {
6181                 do {
6182                         thread_lock(t);
6183                         /*
6184                          * Kick this thread so that it doesn't sit
6185                          * on a wrong wait queue.
6186                          */
6187                         if (ISWAITING(t))
6188                                 setrun_locked(t);
6189
6190                         if (t->t_schedflag & TS_ANYWAITQ)
6191                                 t->t_schedflag &= ~ TS_ANYWAITQ;
6192
6193                         thread_unlock(t);
6194                 } while ((t = t->t_forw) != pp->p_tlist);
6195         }
6196
6197         /*
6198          * If there is a default scheduling class for the zone and it is not
6199          * the class we are currently in, change all of the threads in the
6200          * process to the new class.  We need to be holding pidlock & p_lock
6201          * when we call parmsset so this is a good place to do it.
6202          */
6203         if (zone->zone_defaultcid > 0 &&
6204             zone->zone_defaultcid != curthread->t_cid) {
6205                 pcparms_t pcparms;
6206
6207                 pcparms.pc_cid = zone->zone_defaultcid;
6208                 pcparms.pc_clparms[0] = 0;
6209
6210                 /*
6211                  * If setting the class fails, we still want to enter the zone.
6212                  */
6213                 if ((t = pp->p_tlist) != NULL) {
6214                         do {
6215                                 (void) parmsset(&pcparms, t);
6216                         } while ((t = t->t_forw) != pp->p_tlist);
6217                 }
6218         }
6219
6220         mutex_exit(&pp->p_lock);
6221         mutex_exit(&pidlock);
6222
             /* zonehash_lock has been held since the status re-check above. */
6223         mutex_exit(&zonehash_lock);
6224         /*
6225          * We're firmly in the zone; let pools progress.
6226          */
6227         pool_unlock();
6228         task_rele(oldtk);
6229         /*
6230          * We don't need to retain a hold on the zone since we already
6231          * incremented zone_ntasks, so the zone isn't going anywhere.
6232          */
6233         zone_rele(zone);
6234
6235         /*
6236          * Chroot
              * Both the current directory and the root directory are pointed
              * at the zone's root vnode.
6237          */
6238         vp = zone->zone_rootvp;
6239         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6240         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6241
6242         /*
6243          * Change process security flags.  Note that the _effective_ flags
6244          * cannot change
6245          */
6246         secflags_copy(&pp->p_secflags.psf_lower,
6247             &zone->zone_secflags.psf_lower);
6248         secflags_copy(&pp->p_secflags.psf_upper,
6249             &zone->zone_secflags.psf_upper);
6250         secflags_copy(&pp->p_secflags.psf_inherit,
6251             &zone->zone_secflags.psf_inherit);
6252
6253         /*
6254          * Change process credentials
6255          */
6256         newcr = cralloc();
6257         mutex_enter(&pp->p_crlock);
6258         cr = pp->p_cred;
6259         crcopy_to(cr, newcr);
6260         crsetzone(newcr, zone);
6261         pp->p_cred = newcr;
6262
6263         /*
6264          * Restrict all process privilege sets to zone limit
6265          */
6266         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6267         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6268         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6269         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6270         mutex_exit(&pp->p_crlock);
6271         crset(pp, newcr);
6272
6273         /*
6274          * Adjust upcount to reflect zone entry.
6275          */
6276         uid = crgetruid(newcr);
6277         mutex_enter(&pidlock);
6278         upcount_dec(uid, GLOBAL_ZONEID);
6279         upcount_inc(uid, zoneid);
6280         mutex_exit(&pidlock);
6281
6282         /*
6283          * Set up core file path and content.
6284          */
6285         set_core_defaults();
6286
6287 out:
6288         /*
6289          * Let the other lwps continue.
6290          */
6291         mutex_enter(&pp->p_lock);
6292         if (curthread != pp->p_agenttp)
6293                 continuelwps(pp);
6294         mutex_exit(&pp->p_lock);
6295
6296         return (err != 0 ? set_errno(err) : 0);
6297 }
6298 
6299 /*
6300  * Systemcall entry point for zone_list(2).
6301  *
6302  * Processes running in a (non-global) zone only see themselves.
6303  * On labeled systems, they see all zones whose label they dominate.
6304  */
6305 static int
6306 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6307 {
6308         zoneid_t *zoneids;
6309         zone_t *zone, *myzone;
6310         uint_t user_nzones, real_nzones;
6311         uint_t domi_nzones;
6312         int error;
6313 
6314         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6315                 return (set_errno(EFAULT));
6316 
6317         myzone = curproc->p_zone;
6318         ASSERT(zonecount > 0);
6319         if (myzone != global_zone) {
6320                 bslabel_t *mybslab;
6321 
6322                 if (!is_system_labeled()) {
6323                         /* just return current zone */
6324                         real_nzones = domi_nzones = 1;
6325                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6326                         zoneids[0] = myzone->zone_id;
6327                 } else {
6328                         /* return all zones that are dominated */
6329                         mutex_enter(&zonehash_lock);
6330                         real_nzones = zonecount;
6331                         domi_nzones = 0;
6332                         zoneids = kmem_alloc(real_nzones *
6333                             sizeof (zoneid_t), KM_SLEEP);
6334                         mybslab = label2bslabel(myzone->zone_slabel);
6335                         for (zone = list_head(&zone_active);
6336                             zone != NULL;
6337                             zone = list_next(&zone_active, zone)) {
6338                                 if (zone->zone_id == GLOBAL_ZONEID)
6339                                         continue;
6340                                 if (zone != myzone &&
6341                                     (zone->zone_flags & ZF_IS_SCRATCH))
6342                                         continue;
6343                                 /*
6344                                  * Note that a label always dominates
6345                                  * itself, so myzone is always included
6346                                  * in the list.
6347                                  */
6348                                 if (bldominates(mybslab,
6349                                     label2bslabel(zone->zone_slabel))) {
6350                                         zoneids[domi_nzones++] = zone->zone_id;
6351                                 }
6352                         }
6353                         mutex_exit(&zonehash_lock);
6354                 }
6355         } else {
6356                 mutex_enter(&zonehash_lock);
6357                 real_nzones = zonecount;
6358                 domi_nzones = 0;
6359                 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), KM_SLEEP);
6360                 for (zone = list_head(&zone_active); zone != NULL;
6361                     zone = list_next(&zone_active, zone))
6362                         zoneids[domi_nzones++] = zone->zone_id;
6363 
6364                 ASSERT(domi_nzones == real_nzones);
6365                 mutex_exit(&zonehash_lock);
6366         }
6367 
6368         /*
6369          * If user has allocated space for fewer entries than we found, then
6370          * return only up to their limit.  Either way, tell them exactly how
6371          * many we found.
6372          */
6373         if (domi_nzones < user_nzones)
6374                 user_nzones = domi_nzones;
6375         error = 0;
6376         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6377                 error = EFAULT;
6378         } else if (zoneidlist != NULL && user_nzones != 0) {
6379                 if (copyout(zoneids, zoneidlist,
6380                     user_nzones * sizeof (zoneid_t)) != 0)
6381                         error = EFAULT;
6382         }
6383 
6384         kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6385 
6386         if (error != 0)
6387                 return (set_errno(error));
6388         else
6389                 return (0);
6390 }
6391 
6392 /*
6393  * Systemcall entry point for zone_lookup(2).
6394  *
6395  * Non-global zones are only able to see themselves and (on labeled systems)
6396  * the zones they dominate.
6397  */
6398 static zoneid_t
6399 zone_lookup(const char *zone_name)
6400 {
6401         char *kname;
6402         zone_t *zone;
6403         zoneid_t zoneid;
6404         int err;
6405 
6406         if (zone_name == NULL) {
6407                 /* return caller's zone id */
6408                 return (getzoneid());
6409         }
6410 
6411         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6412         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6413                 kmem_free(kname, ZONENAME_MAX);
6414                 return (set_errno(err));
6415         }
6416 
6417         mutex_enter(&zonehash_lock);
6418         zone = zone_find_all_by_name(kname);
6419         kmem_free(kname, ZONENAME_MAX);
6420         /*
6421          * In a non-global zone, can only lookup global and own name.
6422          * In Trusted Extensions zone label dominance rules apply.
6423          */
6424         if (zone == NULL ||
6425             zone_status_get(zone) < ZONE_IS_READY ||
6426             !zone_list_access(zone)) {
6427                 mutex_exit(&zonehash_lock);
6428                 return (set_errno(EINVAL));
6429         } else {
6430                 zoneid = zone->zone_id;
6431                 mutex_exit(&zonehash_lock);
6432                 return (zoneid);
6433         }
6434 }
6435 
6436 static int
6437 zone_version(int *version_arg)
6438 {
6439         int version = ZONE_SYSCALL_API_VERSION;
6440 
6441         if (copyout(&version, version_arg, sizeof (int)) != 0)
6442                 return (set_errno(EFAULT));
6443         return (0);
6444 }
6445 
6446 /* ARGSUSED */
6447 long
6448 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6449 {
6450         zone_def zs;
6451         int err;
6452 
6453         switch (cmd) {
6454         case ZONE_CREATE:
6455                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6456                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6457                                 return (set_errno(EFAULT));
6458                         }
6459                 } else {
6460 #ifdef _SYSCALL32_IMPL
6461                         zone_def32 zs32;
6462 
6463                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6464                                 return (set_errno(EFAULT));
6465                         }
6466                         zs.zone_name =
6467                             (const char *)(unsigned long)zs32.zone_name;
6468                         zs.zone_root =
6469                             (const char *)(unsigned long)zs32.zone_root;
6470                         zs.zone_privs =
6471                             (const struct priv_set *)
6472                             (unsigned long)zs32.zone_privs;
6473                         zs.zone_privssz = zs32.zone_privssz;
6474                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6475                         zs.rctlbufsz = zs32.rctlbufsz;
6476                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6477                         zs.zfsbufsz = zs32.zfsbufsz;
6478                         zs.extended_error =
6479                             (int *)(unsigned long)zs32.extended_error;
6480                         zs.match = zs32.match;
6481                         zs.doi = zs32.doi;
6482                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6483                         zs.flags = zs32.flags;
6484 #else
6485                         panic("get_udatamodel() returned bogus result\n");
6486 #endif
6487                 }
6488 
6489                 return (zone_create(zs.zone_name, zs.zone_root,
6490                     zs.zone_privs, zs.zone_privssz,
6491                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6492                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6493                     zs.extended_error, zs.match, zs.doi,
6494                     zs.label, zs.flags));
6495         case ZONE_BOOT:
6496                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6497         case ZONE_DESTROY:
6498                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6499         case ZONE_GETATTR:
6500                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6501                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6502         case ZONE_SETATTR:
6503                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6504                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6505         case ZONE_ENTER:
6506                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6507         case ZONE_LIST:
6508                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6509         case ZONE_SHUTDOWN:
6510                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6511         case ZONE_LOOKUP:
6512                 return (zone_lookup((const char *)arg1));
6513         case ZONE_VERSION:
6514                 return (zone_version((int *)arg1));
6515         case ZONE_ADD_DATALINK:
6516                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6517                     (datalink_id_t)(uintptr_t)arg2));
6518         case ZONE_DEL_DATALINK:
6519                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6520                     (datalink_id_t)(uintptr_t)arg2));
6521         case ZONE_CHECK_DATALINK: {
6522                 zoneid_t        zoneid;
6523                 boolean_t       need_copyout;
6524 
6525                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6526                         return (EFAULT);
6527                 need_copyout = (zoneid == ALL_ZONES);
6528                 err = zone_check_datalink(&zoneid,
6529                     (datalink_id_t)(uintptr_t)arg2);
6530                 if (err == 0 && need_copyout) {
6531                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6532                                 err = EFAULT;
6533                 }
6534                 return (err == 0 ? 0 : set_errno(err));
6535         }
6536         case ZONE_LIST_DATALINK:
6537                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6538                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6539         default:
6540                 return (set_errno(EINVAL));
6541         }
6542 }
6543 
6544 struct zarg {
6545         zone_t *zone;
6546         zone_cmd_arg_t arg;
6547 };
6548 
6549 static int
6550 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6551 {
6552         char *buf;
6553         size_t buflen;
6554         int error;
6555 
6556         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6557         buf = kmem_alloc(buflen, KM_SLEEP);
6558         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6559         error = door_ki_open(buf, doorp);
6560         kmem_free(buf, buflen);
6561         return (error);
6562 }
6563 
6564 static void
6565 zone_release_door(door_handle_t *doorp)
6566 {
6567         door_ki_rele(*doorp);
6568         *doorp = NULL;
6569 }
6570 
6571 static void
6572 zone_ki_call_zoneadmd(struct zarg *zargp)
6573 {
6574         door_handle_t door = NULL;
6575         door_arg_t darg, save_arg;
6576         char *zone_name;
6577         size_t zone_namelen;
6578         zoneid_t zoneid;
6579         zone_t *zone;
6580         zone_cmd_arg_t arg;
6581         uint64_t uniqid;
6582         size_t size;
6583         int error;
6584         int retry;
6585 
6586         zone = zargp->zone;
6587         arg = zargp->arg;
6588         kmem_free(zargp, sizeof (*zargp));
6589 
6590         zone_namelen = strlen(zone->zone_name) + 1;
6591         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6592         bcopy(zone->zone_name, zone_name, zone_namelen);
6593         zoneid = zone->zone_id;
6594         uniqid = zone->zone_uniqid;
6595         /*
6596          * zoneadmd may be down, but at least we can empty out the zone.
6597          * We can ignore the return value of zone_empty() since we're called
6598          * from a kernel thread and know we won't be delivered any signals.
6599          */
6600         ASSERT(curproc == &p0);
6601         (void) zone_empty(zone);
6602         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6603         zone_rele(zone);
6604 
6605         size = sizeof (arg);
6606         darg.rbuf = (char *)&arg;
6607         darg.data_ptr = (char *)&arg;
6608         darg.rsize = size;
6609         darg.data_size = size;
6610         darg.desc_ptr = NULL;
6611         darg.desc_num = 0;
6612 
6613         save_arg = darg;
6614         /*
6615          * Since we're not holding a reference to the zone, any number of
6616          * things can go wrong, including the zone disappearing before we get a
6617          * chance to talk to zoneadmd.
6618          */
6619         for (retry = 0; /* forever */; retry++) {
6620                 if (door == NULL &&
6621                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6622                         goto next;
6623                 }
6624                 ASSERT(door != NULL);
6625 
6626                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6627                     SIZE_MAX, 0)) == 0) {
6628                         break;
6629                 }
6630                 switch (error) {
6631                 case EINTR:
6632                         /* FALLTHROUGH */
6633                 case EAGAIN:    /* process may be forking */
6634                         /*
6635                          * Back off for a bit
6636                          */
6637                         break;
6638                 case EBADF:
6639                         zone_release_door(&door);
6640                         if (zone_lookup_door(zone_name, &door) != 0) {
6641                                 /*
6642                                  * zoneadmd may be dead, but it may come back to
6643                                  * life later.
6644                                  */
6645                                 break;
6646                         }
6647                         break;
6648                 default:
6649                         cmn_err(CE_WARN,
6650                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6651                             error);
6652                         goto out;
6653                 }
6654 next:
6655                 /*
6656                  * If this isn't the same zone_t that we originally had in mind,
6657                  * then this is the same as if two kadmin requests come in at
6658                  * the same time: the first one wins.  This means we lose, so we
6659                  * bail.
6660                  */
6661                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
6662                         /*
6663                          * Problem is solved.
6664                          */
6665                         break;
6666                 }
6667                 if (zone->zone_uniqid != uniqid) {
6668                         /*
6669                          * zoneid recycled
6670                          */
6671                         zone_rele(zone);
6672                         break;
6673                 }
6674                 /*
6675                  * We could zone_status_timedwait(), but there doesn't seem to
6676                  * be much point in doing that (plus, it would mean that
6677                  * zone_free() isn't called until this thread exits).
6678                  */
6679                 zone_rele(zone);
6680                 delay(hz);
6681                 darg = save_arg;
6682         }
6683 out:
6684         if (door != NULL) {
6685                 zone_release_door(&door);
6686         }
6687         kmem_free(zone_name, zone_namelen);
6688         thread_exit();
6689 }
6690 
6691 /*
6692  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6693  * kadmin().  The caller is a process in the zone.
6694  *
6695  * In order to shutdown the zone, we will hand off control to zoneadmd
6696  * (running in the global zone) via a door.  We do a half-hearted job at
6697  * killing all processes in the zone, create a kernel thread to contact
6698  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6699  * a form of generation number used to let zoneadmd (as well as
6700  * zone_destroy()) know exactly which zone they're re talking about.
6701  */
6702 int
6703 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6704 {
6705         struct zarg *zargp;
6706         zone_cmd_t zcmd;
6707         zone_t *zone;
6708 
6709         zone = curproc->p_zone;
6710         ASSERT(getzoneid() != GLOBAL_ZONEID);
6711 
6712         switch (cmd) {
6713         case A_SHUTDOWN:
6714                 switch (fcn) {
6715                 case AD_HALT:
6716                 case AD_POWEROFF:
6717                         zcmd = Z_HALT;
6718                         break;
6719                 case AD_BOOT:
6720                         zcmd = Z_REBOOT;
6721                         break;
6722                 case AD_IBOOT:
6723                 case AD_SBOOT:
6724                 case AD_SIBOOT:
6725                 case AD_NOSYNC:
6726                         return (ENOTSUP);
6727                 default:
6728                         return (EINVAL);
6729                 }
6730                 break;
6731         case A_REBOOT:
6732                 zcmd = Z_REBOOT;
6733                 break;
6734         case A_FTRACE:
6735         case A_REMOUNT:
6736         case A_FREEZE:
6737         case A_DUMP:
6738         case A_CONFIG:
6739                 return (ENOTSUP);
6740         default:
6741                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6742                 return (EINVAL);
6743         }
6744 
6745         if (secpolicy_zone_admin(credp, B_FALSE))
6746                 return (EPERM);
6747         mutex_enter(&zone_status_lock);
6748 
6749         /*
6750          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6751          * is in the zone.
6752          */
6753         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6754         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6755                 /*
6756                  * This zone is already on its way down.
6757                  */
6758                 mutex_exit(&zone_status_lock);
6759                 return (0);
6760         }
6761         /*
6762          * Prevent future zone_enter()s
6763          */
6764         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6765         mutex_exit(&zone_status_lock);
6766 
6767         /*
6768          * Kill everyone now and call zoneadmd later.
6769          * zone_ki_call_zoneadmd() will do a more thorough job of this
6770          * later.
6771          */
6772         killall(zone->zone_id);
6773         /*
6774          * Now, create the thread to contact zoneadmd and do the rest of the
6775          * work.  This thread can't be created in our zone otherwise
6776          * zone_destroy() would deadlock.
6777          */
6778         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6779         zargp->arg.cmd = zcmd;
6780         zargp->arg.uniqid = zone->zone_uniqid;
6781         zargp->zone = zone;
6782         (void) strcpy(zargp->arg.locale, "C");
6783         /* mdep was already copied in for us by uadmin */
6784         if (mdep != NULL)
6785                 (void) strlcpy(zargp->arg.bootbuf, mdep,
6786                     sizeof (zargp->arg.bootbuf));
6787         zone_hold(zone);
6788 
6789         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6790             TS_RUN, minclsyspri);
6791         exit(CLD_EXITED, 0);
6792 
6793         return (EINVAL);
6794 }
6795 
6796 /*
6797  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6798  * status to ZONE_IS_SHUTTING_DOWN.
6799  *
6800  * This function also shuts down all running zones to ensure that they won't
6801  * fork new processes.
6802  */
6803 void
6804 zone_shutdown_global(void)
6805 {
6806         zone_t *current_zonep;
6807 
6808         ASSERT(INGLOBALZONE(curproc));
6809         mutex_enter(&zonehash_lock);
6810         mutex_enter(&zone_status_lock);
6811 
6812         /* Modify the global zone's status first. */
6813         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6814         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6815 
6816         /*
6817          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6818          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6819          * could cause assertions to fail (e.g., assertions about a zone's
6820          * state during initialization, readying, or booting) or produce races.
6821          * We'll let threads continue to initialize and ready new zones: they'll
6822          * fail to boot the new zones when they see that the global zone is
6823          * shutting down.
6824          */
6825         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6826             current_zonep = list_next(&zone_active, current_zonep)) {
6827                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6828                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6829         }
6830         mutex_exit(&zone_status_lock);
6831         mutex_exit(&zonehash_lock);
6832 }
6833 
6834 /*
6835  * Returns true if the named dataset is visible in the current zone.
6836  * The 'write' parameter is set to 1 if the dataset is also writable.
6837  */
6838 int
6839 zone_dataset_visible(const char *dataset, int *write)
6840 {
6841         static int zfstype = -1;
6842         zone_dataset_t *zd;
6843         size_t len;
6844         zone_t *zone = curproc->p_zone;
6845         const char *name = NULL;
6846         vfs_t *vfsp = NULL;
6847 
6848         if (dataset[0] == '\0')
6849                 return (0);
6850 
6851         /*
6852          * Walk the list once, looking for datasets which match exactly, or
6853          * specify a dataset underneath an exported dataset.  If found, return
6854          * true and note that it is writable.
6855          */
6856         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6857             zd = list_next(&zone->zone_datasets, zd)) {
6858 
6859                 len = strlen(zd->zd_dataset);
6860                 if (strlen(dataset) >= len &&
6861                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6862                     (dataset[len] == '\0' || dataset[len] == '/' ||
6863                     dataset[len] == '@')) {
6864                         if (write)
6865                                 *write = 1;
6866                         return (1);
6867                 }
6868         }
6869 
6870         /*
6871          * Walk the list a second time, searching for datasets which are parents
6872          * of exported datasets.  These should be visible, but read-only.
6873          *
6874          * Note that we also have to support forms such as 'pool/dataset/', with
6875          * a trailing slash.
6876          */
6877         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6878             zd = list_next(&zone->zone_datasets, zd)) {
6879 
6880                 len = strlen(dataset);
6881                 if (dataset[len - 1] == '/')
6882                         len--;  /* Ignore trailing slash */
6883                 if (len < strlen(zd->zd_dataset) &&
6884                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6885                     zd->zd_dataset[len] == '/') {
6886                         if (write)
6887                                 *write = 0;
6888                         return (1);
6889                 }
6890         }
6891 
6892         /*
6893          * We reach here if the given dataset is not found in the zone_dataset
6894          * list. Check if this dataset was added as a filesystem (ie. "add fs")
6895          * instead of delegation. For this we search for the dataset in the
6896          * zone_vfslist of this zone. If found, return true and note that it is
6897          * not writable.
6898          */
6899 
6900         /*
6901          * Initialize zfstype if it is not initialized yet.
6902          */
6903         if (zfstype == -1) {
6904                 struct vfssw *vswp = vfs_getvfssw("zfs");
6905                 zfstype = vswp - vfssw;
6906                 vfs_unrefvfssw(vswp);
6907         }
6908 
6909         vfs_list_read_lock();
6910         vfsp = zone->zone_vfslist;
6911         do {
6912                 ASSERT(vfsp);
6913                 if (vfsp->vfs_fstype == zfstype) {
6914                         name = refstr_value(vfsp->vfs_resource);
6915 
6916                         /*
6917                          * Check if we have an exact match.
6918                          */
6919                         if (strcmp(dataset, name) == 0) {
6920                                 vfs_list_unlock();
6921                                 if (write)
6922                                         *write = 0;
6923                                 return (1);
6924                         }
6925                         /*
6926                          * We need to check if we are looking for parents of
6927                          * a dataset. These should be visible, but read-only.
6928                          */
6929                         len = strlen(dataset);
6930                         if (dataset[len - 1] == '/')
6931                                 len--;
6932 
6933                         if (len < strlen(name) &&
6934                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6935                                 vfs_list_unlock();
6936                                 if (write)
6937                                         *write = 0;
6938                                 return (1);
6939                         }
6940                 }
6941                 vfsp = vfsp->vfs_zone_next;
6942         } while (vfsp != zone->zone_vfslist);
6943 
6944         vfs_list_unlock();
6945         return (0);
6946 }
6947 
6948 /*
6949  * zone_find_by_any_path() -
6950  *
6951  * kernel-private routine similar to zone_find_by_path(), but which
6952  * effectively compares against zone paths rather than zonerootpath
6953  * (i.e., the last component of zonerootpaths, which should be "root/",
6954  * are not compared.)  This is done in order to accurately identify all
6955  * paths, whether zone-visible or not, including those which are parallel
6956  * to /root/, such as /dev/, /home/, etc...
6957  *
6958  * If the specified path does not fall under any zone path then global
6959  * zone is returned.
6960  *
6961  * The treat_abs parameter indicates whether the path should be treated as
6962  * an absolute path although it does not begin with "/".  (This supports
6963  * nfs mount syntax such as host:any/path.)
6964  *
6965  * The caller is responsible for zone_rele of the returned zone.
6966  */
6967 zone_t *
6968 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6969 {
6970         zone_t *zone;
6971         int path_offset = 0;
6972 
6973         if (path == NULL) {
6974                 zone_hold(global_zone);
6975                 return (global_zone);
6976         }
6977 
6978         if (*path != '/') {
6979                 ASSERT(treat_abs);
6980                 path_offset = 1;
6981         }
6982 
6983         mutex_enter(&zonehash_lock);
6984         for (zone = list_head(&zone_active); zone != NULL;
6985             zone = list_next(&zone_active, zone)) {
6986                 char    *c;
6987                 size_t  pathlen;
6988                 char *rootpath_start;
6989 
6990                 if (zone == global_zone)        /* skip global zone */
6991                         continue;
6992 
6993                 /* scan backwards to find start of last component */
6994                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6995                 do {
6996                         c--;
6997                 } while (*c != '/');
6998 
6999                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
7000                 rootpath_start = (zone->zone_rootpath + path_offset);
7001                 if (strncmp(path, rootpath_start, pathlen) == 0)
7002                         break;
7003         }
7004         if (zone == NULL)
7005                 zone = global_zone;
7006         zone_hold(zone);
7007         mutex_exit(&zonehash_lock);
7008         return (zone);
7009 }
7010 
7011 /*
7012  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7013  * zone_dl_t pointer if found, and NULL otherwise.
7014  */
7015 static zone_dl_t *
7016 zone_find_dl(zone_t *zone, datalink_id_t linkid)
7017 {
7018         zone_dl_t *zdl;
7019 
7020         ASSERT(mutex_owned(&zone->zone_lock));
7021         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7022             zdl = list_next(&zone->zone_dl_list, zdl)) {
7023                 if (zdl->zdl_id == linkid)
7024                         break;
7025         }
7026         return (zdl);
7027 }
7028 
7029 static boolean_t
7030 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7031 {
7032         boolean_t exists;
7033 
7034         mutex_enter(&zone->zone_lock);
7035         exists = (zone_find_dl(zone, linkid) != NULL);
7036         mutex_exit(&zone->zone_lock);
7037         return (exists);
7038 }
7039 
7040 /*
7041  * Add an data link name for the zone.
7042  */
7043 static int
7044 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7045 {
7046         zone_dl_t *zdl;
7047         zone_t *zone;
7048         zone_t *thiszone;
7049 
7050         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7051                 return (set_errno(ENXIO));
7052 
7053         /* Verify that the datalink ID doesn't already belong to a zone. */
7054         mutex_enter(&zonehash_lock);
7055         for (zone = list_head(&zone_active); zone != NULL;
7056             zone = list_next(&zone_active, zone)) {
7057                 if (zone_dl_exists(zone, linkid)) {
7058                         mutex_exit(&zonehash_lock);
7059                         zone_rele(thiszone);
7060                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7061                 }
7062         }
7063 
7064         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7065         zdl->zdl_id = linkid;
7066         zdl->zdl_net = NULL;
7067         mutex_enter(&thiszone->zone_lock);
7068         list_insert_head(&thiszone->zone_dl_list, zdl);
7069         mutex_exit(&thiszone->zone_lock);
7070         mutex_exit(&zonehash_lock);
7071         zone_rele(thiszone);
7072         return (0);
7073 }
7074 
7075 static int
7076 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7077 {
7078         zone_dl_t *zdl;
7079         zone_t *zone;
7080         int err = 0;
7081 
7082         if ((zone = zone_find_by_id(zoneid)) == NULL)
7083                 return (set_errno(EINVAL));
7084 
7085         mutex_enter(&zone->zone_lock);
7086         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7087                 err = ENXIO;
7088         } else {
7089                 list_remove(&zone->zone_dl_list, zdl);
7090                 nvlist_free(zdl->zdl_net);
7091                 kmem_free(zdl, sizeof (zone_dl_t));
7092         }
7093         mutex_exit(&zone->zone_lock);
7094         zone_rele(zone);
7095         return (err == 0 ? 0 : set_errno(err));
7096 }
7097 
7098 /*
7099  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
7100  * the linkid.  Otherwise we just check if the specified zoneidp has been
7101  * assigned the supplied linkid.
7102  */
7103 int
7104 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7105 {
7106         zone_t *zone;
7107         int err = ENXIO;
7108 
7109         if (*zoneidp != ALL_ZONES) {
7110                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7111                         if (zone_dl_exists(zone, linkid))
7112                                 err = 0;
7113                         zone_rele(zone);
7114                 }
7115                 return (err);
7116         }
7117 
7118         mutex_enter(&zonehash_lock);
7119         for (zone = list_head(&zone_active); zone != NULL;
7120             zone = list_next(&zone_active, zone)) {
7121                 if (zone_dl_exists(zone, linkid)) {
7122                         *zoneidp = zone->zone_id;
7123                         err = 0;
7124                         break;
7125                 }
7126         }
7127         mutex_exit(&zonehash_lock);
7128         return (err);
7129 }
7130 
7131 /*
7132  * Get the list of datalink IDs assigned to a zone.
7133  *
7134  * On input, *nump is the number of datalink IDs that can fit in the supplied
7135  * idarray.  Upon return, *nump is either set to the number of datalink IDs
7136  * that were placed in the array if the array was large enough, or to the
7137  * number of datalink IDs that the function needs to place in the array if the
7138  * array is too small.
7139  */
7140 static int
7141 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7142 {
7143         uint_t num, dlcount;
7144         zone_t *zone;
7145         zone_dl_t *zdl;
7146         datalink_id_t *idptr = idarray;
7147 
7148         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7149                 return (set_errno(EFAULT));
7150         if ((zone = zone_find_by_id(zoneid)) == NULL)
7151                 return (set_errno(ENXIO));
7152 
7153         num = 0;
7154         mutex_enter(&zone->zone_lock);
7155         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7156             zdl = list_next(&zone->zone_dl_list, zdl)) {
7157                 /*
7158                  * If the list is bigger than what the caller supplied, just
7159                  * count, don't do copyout.
7160                  */
7161                 if (++num > dlcount)
7162                         continue;
7163                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7164                         mutex_exit(&zone->zone_lock);
7165                         zone_rele(zone);
7166                         return (set_errno(EFAULT));
7167                 }
7168                 idptr++;
7169         }
7170         mutex_exit(&zone->zone_lock);
7171         zone_rele(zone);
7172 
7173         /* Increased or decreased, caller should be notified. */
7174         if (num != dlcount) {
7175                 if (copyout(&num, nump, sizeof (num)) != 0)
7176                         return (set_errno(EFAULT));
7177         }
7178         return (0);
7179 }
7180 
7181 /*
7182  * Public interface for looking up a zone by zoneid. It's a customized version
7183  * for netstack_zone_create(). It can only be called from the zsd create
7184  * callbacks, since it doesn't have reference on the zone structure hence if
7185  * it is called elsewhere the zone could disappear after the zonehash_lock
7186  * is dropped.
7187  *
7188  * Furthermore it
7189  * 1. Doesn't check the status of the zone.
7190  * 2. It will be called even before zone_init is called, in that case the
7191  *    address of zone0 is returned directly, and netstack_zone_create()
7192  *    will only assign a value to zone0.zone_netstack, won't break anything.
7193  * 3. Returns without the zone being held.
7194  */
7195 zone_t *
7196 zone_find_by_id_nolock(zoneid_t zoneid)
7197 {
7198         zone_t *zone;
7199 
7200         mutex_enter(&zonehash_lock);
7201         if (zonehashbyid == NULL)
7202                 zone = &zone0;
7203         else
7204                 zone = zone_find_all_by_id(zoneid);
7205         mutex_exit(&zonehash_lock);
7206         return (zone);
7207 }
7208 
7209 /*
7210  * Walk the datalinks for a given zone
7211  */
7212 int
7213 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7214     void *data)
7215 {
7216         zone_t          *zone;
7217         zone_dl_t       *zdl;
7218         datalink_id_t   *idarray;
7219         uint_t          idcount = 0;
7220         int             i, ret = 0;
7221 
7222         if ((zone = zone_find_by_id(zoneid)) == NULL)
7223                 return (ENOENT);
7224 
7225         /*
7226          * We first build an array of linkid's so that we can walk these and
7227          * execute the callback with the zone_lock dropped.
7228          */
7229         mutex_enter(&zone->zone_lock);
7230         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7231             zdl = list_next(&zone->zone_dl_list, zdl)) {
7232                 idcount++;
7233         }
7234 
7235         if (idcount == 0) {
7236                 mutex_exit(&zone->zone_lock);
7237                 zone_rele(zone);
7238                 return (0);
7239         }
7240 
7241         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7242         if (idarray == NULL) {
7243                 mutex_exit(&zone->zone_lock);
7244                 zone_rele(zone);
7245                 return (ENOMEM);
7246         }
7247 
7248         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7249             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7250                 idarray[i] = zdl->zdl_id;
7251         }
7252 
7253         mutex_exit(&zone->zone_lock);
7254 
7255         for (i = 0; i < idcount && ret == 0; i++) {
7256                 if ((ret = (*cb)(idarray[i], data)) != 0)
7257                         break;
7258         }
7259 
7260         zone_rele(zone);
7261         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7262         return (ret);
7263 }
7264 
7265 static char *
7266 zone_net_type2name(int type)
7267 {
7268         switch (type) {
7269         case ZONE_NETWORK_ADDRESS:
7270                 return (ZONE_NET_ADDRNAME);
7271         case ZONE_NETWORK_DEFROUTER:
7272                 return (ZONE_NET_RTRNAME);
7273         default:
7274                 return (NULL);
7275         }
7276 }
7277 
7278 static int
7279 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7280 {
7281         zone_t *zone;
7282         zone_dl_t *zdl;
7283         nvlist_t *nvl;
7284         int err = 0;
7285         uint8_t *new = NULL;
7286         char *nvname;
7287         int bufsize;
7288         datalink_id_t linkid = znbuf->zn_linkid;
7289 
7290         if (secpolicy_zone_config(CRED()) != 0)
7291                 return (set_errno(EPERM));
7292 
7293         if (zoneid == GLOBAL_ZONEID)
7294                 return (set_errno(EINVAL));
7295 
7296         nvname = zone_net_type2name(znbuf->zn_type);
7297         bufsize = znbuf->zn_len;
7298         new = znbuf->zn_val;
7299         if (nvname == NULL)
7300                 return (set_errno(EINVAL));
7301 
7302         if ((zone = zone_find_by_id(zoneid)) == NULL) {
7303                 return (set_errno(EINVAL));
7304         }
7305 
7306         mutex_enter(&zone->zone_lock);
7307         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7308                 err = ENXIO;
7309                 goto done;
7310         }
7311         if ((nvl = zdl->zdl_net) == NULL) {
7312                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7313                         err = ENOMEM;
7314                         goto done;
7315                 } else {
7316                         zdl->zdl_net = nvl;
7317                 }
7318         }
7319         if (nvlist_exists(nvl, nvname)) {
7320                 err = EINVAL;
7321                 goto done;
7322         }
7323         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7324         ASSERT(err == 0);
7325 done:
7326         mutex_exit(&zone->zone_lock);
7327         zone_rele(zone);
7328         if (err != 0)
7329                 return (set_errno(err));
7330         else
7331                 return (0);
7332 }
7333 
7334 static int
7335 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7336 {
7337         zone_t *zone;
7338         zone_dl_t *zdl;
7339         nvlist_t *nvl;
7340         uint8_t *ptr;
7341         uint_t psize;
7342         int err = 0;
7343         char *nvname;
7344         int bufsize;
7345         void *buf;
7346         datalink_id_t linkid = znbuf->zn_linkid;
7347 
7348         if (zoneid == GLOBAL_ZONEID)
7349                 return (set_errno(EINVAL));
7350 
7351         nvname = zone_net_type2name(znbuf->zn_type);
7352         bufsize = znbuf->zn_len;
7353         buf = znbuf->zn_val;
7354 
7355         if (nvname == NULL)
7356                 return (set_errno(EINVAL));
7357         if ((zone = zone_find_by_id(zoneid)) == NULL)
7358                 return (set_errno(EINVAL));
7359 
7360         mutex_enter(&zone->zone_lock);
7361         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7362                 err = ENXIO;
7363                 goto done;
7364         }
7365         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7366                 err = ENOENT;
7367                 goto done;
7368         }
7369         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7370         ASSERT(err == 0);
7371 
7372         if (psize > bufsize) {
7373                 err = ENOBUFS;
7374                 goto done;
7375         }
7376         znbuf->zn_len = psize;
7377         bcopy(ptr, buf, psize);
7378 done:
7379         mutex_exit(&zone->zone_lock);
7380         zone_rele(zone);
7381         if (err != 0)
7382                 return (set_errno(err));
7383         else
7384                 return (0);
7385 }