illumos-gate New usr/src/uts/common/os/zone.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2013, Joyent Inc. All rights reserved.
  25  * Copyright 2016 Garrett D'Amore
  26  */
  27 
  28 /*
  29  * Zones
  30  *
  31  *   A zone is a named collection of processes, namespace constraints,
  32  *   and other system resources which comprise a secure and manageable
  33  *   application containment facility.
  34  *
  35  *   Zones (represented by the reference counted zone_t) are tracked in
  36  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  37  *   (zoneid_t) are used to track zone association.  Zone IDs are
  38  *   dynamically generated when the zone is created; if a persistent
  39  *   identifier is needed (core files, accounting logs, audit trail,
  40  *   etc.), the zone name should be used.
  41  *
  42  *
  43  *   Global Zone:
  44  *
  45  *   The global zone (zoneid 0) is automatically associated with all
  46  *   system resources that have not been bound to a user-created zone.
  47  *   This means that even systems where zones are not in active use
  48  *   have a global zone, and all processes, mounts, etc. are
  49  *   associated with that zone.  The global zone is generally
  50  *   unconstrained in terms of privileges and access, though the usual
  51  *   credential and privilege based restrictions apply.
  52  *
  53  *
  54  *   Zone States:
  55  *
  56  *   The states in which a zone may be and the transitions are as
  57  *   follows:
  58  *
  59  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  60  *   initialized zone is added to the list of active zones on the system but
  61  *   isn't accessible.
  62  *
  63  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  64  *   not yet completed. Not possible to enter the zone, but attributes can
  65  *   be retrieved.
  66  *
  67  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  68  *   ready.  The zone is made visible after the ZSD constructor callbacks are
  69  *   executed.  A zone remains in this state until it transitions into
  70  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  71  *
  72  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  73  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  74  *   state.
  75  *
  76  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  77  *   successfully started init.   A zone remains in this state until
  78  *   zone_shutdown() is called.
  79  *
  80  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  81  *   killing all processes running in the zone. The zone remains
  82  *   in this state until there are no more user processes running in the zone.
  83  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  84  *   Since zone_shutdown() is restartable, it may be called successfully
  85  *   multiple times for the same zone_t.  Setting of the zone's state to
  86  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  87  *   the zone's status without worrying about it being a moving target.
  88  *
  89  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  90  *   are no more user processes in the zone.  The zone remains in this
  91  *   state until there are no more kernel threads associated with the
  92  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  93  *   fail.
  94  *
  95  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  96  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  97  *   join the zone or create kernel threads therein.
  98  *
  99  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 100  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 101  *   return NULL from now on.
 102  *
 103  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 104  *   processes or threads doing work on behalf of the zone.  The zone is
 105  *   removed from the list of active zones.  zone_destroy() returns, and
 106  *   the zone can be recreated.
 107  *
 108  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 109  *   callbacks are executed, and all memory associated with the zone is
 110  *   freed.
 111  *
 112  *   Threads can wait for the zone to enter a requested state by using
 113  *   zone_status_wait() or zone_status_timedwait() with the desired
 114  *   state passed in as an argument.  Zone state transitions are
 115  *   uni-directional; it is not possible to move back to an earlier state.
 116  *
 117  *
 118  *   Zone-Specific Data:
 119  *
 120  *   Subsystems needing to maintain zone-specific data can store that
 121  *   data using the ZSD mechanism.  This provides a zone-specific data
 122  *   store, similar to thread-specific data (see pthread_getspecific(3C)
 123  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 124  *   to register callbacks to be invoked when a zone is created, shut
 125  *   down, or destroyed.  This can be used to initialize zone-specific
 126  *   data for new zones and to clean up when zones go away.
 127  *
 128  *
 129  *   Data Structures:
 130  *
 131  *   The per-zone structure (zone_t) is reference counted, and freed
 132  *   when all references are released.  zone_hold and zone_rele can be
 133  *   used to adjust the reference count.  In addition, reference counts
 134  *   associated with the cred_t structure are tracked separately using
 135  *   zone_cred_hold and zone_cred_rele.
 136  *
 137  *   Pointers to active zone_t's are stored in two hash tables; one
 138  *   for searching by id, the other for searching by name.  Lookups
 139  *   can be performed on either basis, using zone_find_by_id and
 140  *   zone_find_by_name.  Both return zone_t pointers with the zone
 141  *   held, so zone_rele should be called when the pointer is no longer
 142  *   needed.  Zones can also be searched by path; zone_find_by_path
 143  *   returns the zone with which a path name is associated (global
 144  *   zone if the path is not within some other zone's file system
 145  *   hierarchy).  This currently requires iterating through each zone,
 146  *   so it is slower than an id or name search via a hash table.
 147  *
 148  *
 149  *   Locking:
 150  *
 151  *   zonehash_lock: This is a top-level global lock used to protect the
 152  *       zone hash tables and lists.  Zones cannot be created or destroyed
 153  *       while this lock is held.
 154  *   zone_status_lock: This is a global lock protecting zone state.
 155  *       Zones cannot change state while this lock is held.  It also
 156  *       protects the list of kernel threads associated with a zone.
 157  *   zone_lock: This is a per-zone lock used to protect several fields of
 158  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 159  *       this lock means that the zone cannot go away.
 160  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 161  *       related to the zone.max-lwps rctl.
 162  *   zone_mem_lock: This is a per-zone lock used to protect the fields
 163  *       related to the zone.max-locked-memory and zone.max-swap rctls.
 164  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 165  *       currently just max_lofi
 166  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 167  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 168  *       list (a list of zones in the ZONE_IS_DEAD state).
 169  *
 170  *   Ordering requirements:
 171  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 172  *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 173  *
 174  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 175  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 176  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 177  *
 178  *   Blocking memory allocations are permitted while holding any of the
 179  *   zone locks.
 180  *
 181  *
 182  *   System Call Interface:
 183  *
 184  *   The zone subsystem can be managed and queried from user level with
 185  *   the following system calls (all subcodes of the primary "zone"
 186  *   system call):
 187  *   - zone_create: creates a zone with selected attributes (name,
 188  *     root path, privileges, resource controls, ZFS datasets)
 189  *   - zone_enter: allows the current process to enter a zone
 190  *   - zone_getattr: reports attributes of a zone
 191  *   - zone_setattr: set attributes of a zone
 192  *   - zone_boot: set 'init' running for the zone
 193  *   - zone_list: lists all zones active in the system
 194  *   - zone_lookup: looks up zone id based on name
 195  *   - zone_shutdown: initiates shutdown process (see states above)
 196  *   - zone_destroy: completes shutdown process (see states above)
 197  *
 198  */
 199 
 200 #include <sys/priv_impl.h>
 201 #include <sys/cred.h>
 202 #include <c2/audit.h>
 203 #include <sys/debug.h>
 204 #include <sys/file.h>
 205 #include <sys/kmem.h>
 206 #include <sys/kstat.h>
 207 #include <sys/mutex.h>
 208 #include <sys/note.h>
 209 #include <sys/pathname.h>
 210 #include <sys/proc.h>
 211 #include <sys/project.h>
 212 #include <sys/sysevent.h>
 213 #include <sys/task.h>
 214 #include <sys/systm.h>
 215 #include <sys/types.h>
 216 #include <sys/utsname.h>
 217 #include <sys/vnode.h>
 218 #include <sys/vfs.h>
 219 #include <sys/systeminfo.h>
 220 #include <sys/policy.h>
 221 #include <sys/cred_impl.h>
 222 #include <sys/contract_impl.h>
 223 #include <sys/contract/process_impl.h>
 224 #include <sys/class.h>
 225 #include <sys/pool.h>
 226 #include <sys/pool_pset.h>
 227 #include <sys/pset.h>
 228 #include <sys/strlog.h>
 229 #include <sys/sysmacros.h>
 230 #include <sys/callb.h>
 231 #include <sys/vmparam.h>
 232 #include <sys/corectl.h>
 233 #include <sys/ipc_impl.h>
 234 #include <sys/klpd.h>
 235 
 236 #include <sys/door.h>
 237 #include <sys/cpuvar.h>
 238 #include <sys/sdt.h>
 239 
 240 #include <sys/uadmin.h>
 241 #include <sys/session.h>
 242 #include <sys/cmn_err.h>
 243 #include <sys/modhash.h>
 244 #include <sys/sunddi.h>
 245 #include <sys/nvpair.h>
 246 #include <sys/rctl.h>
 247 #include <sys/fss.h>
 248 #include <sys/brand.h>
 249 #include <sys/zone.h>
 250 #include <net/if.h>
 251 #include <sys/cpucaps.h>
 252 #include <vm/seg.h>
 253 #include <sys/mac.h>
 254 
 255 /*
 256  * This constant specifies the number of seconds that threads waiting for
 257  * subsystems to release a zone's general-purpose references will wait before
 258  * they log the zone's reference counts.  The constant's value shouldn't
 259  * be so small that reference counts are unnecessarily reported for zones
 260  * whose references are slowly released.  On the other hand, it shouldn't be so
 261  * large that users reboot their systems out of frustration over hung zones
 262  * before the system logs the zones' reference counts.
 263  */
 264 #define ZONE_DESTROY_TIMEOUT_SECS       60
 265 
 266 /* List of data link IDs which are accessible from the zone */
 267 typedef struct zone_dl {
 268         datalink_id_t   zdl_id;
 269         nvlist_t        *zdl_net;
 270         list_node_t     zdl_linkage;
 271 } zone_dl_t;
 272 
 273 /*
 274  * cv used to signal that all references to the zone have been released.  This
 275  * needs to be global since there may be multiple waiters, and the first to
 276  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 277  */
 278 static kcondvar_t zone_destroy_cv;
 279 /*
 280  * Lock used to serialize access to zone_cv.  This could have been per-zone,
 281  * but then we'd need another lock for zone_destroy_cv, and why bother?
 282  */
 283 static kmutex_t zone_status_lock;
 284 
 285 /*
 286  * ZSD-related global variables.
 287  */
 288 static kmutex_t zsd_key_lock;   /* protects the following two */
 289 /*
 290  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 291  */
 292 static zone_key_t zsd_keyval = 0;
 293 /*
 294  * Global list of registered keys.  We use this when a new zone is created.
 295  */
 296 static list_t zsd_registered_keys;
 297 
 298 int zone_hash_size = 256;
 299 static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 300 static kmutex_t zonehash_lock;
 301 static uint_t zonecount;
 302 static id_space_t *zoneid_space;
 303 
 304 /*
 305  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 306  * kernel proper runs, and which manages all other zones.
 307  *
 308  * Although not declared as static, the variable "zone0" should not be used
 309  * except for by code that needs to reference the global zone early on in boot,
 310  * before it is fully initialized.  All other consumers should use
 311  * 'global_zone'.
 312  */
 313 zone_t zone0;
 314 zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 315 
 316 /*
 317  * List of active zones, protected by zonehash_lock.
 318  */
 319 static list_t zone_active;
 320 
 321 /*
 322  * List of destroyed zones that still have outstanding cred references.
 323  * Used for debugging.  Uses a separate lock to avoid lock ordering
 324  * problems in zone_free.
 325  */
 326 static list_t zone_deathrow;
 327 static kmutex_t zone_deathrow_lock;
 328 
 329 /* number of zones is limited by virtual interface limit in IP */
 330 uint_t maxzones = 8192;
 331 
 332 /* Event channel used to send zone state change notifications */
 333 evchan_t *zone_event_chan;
 334 
 335 /*
 336  * This table holds the mapping from kernel zone states to
 337  * states visible in the state notification API.
 338  * The idea is that we only expose "obvious" states and
 339  * do not expose states which are just implementation details.
 340  */
 341 const char  *zone_status_table[] = {
 342         ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 343         ZONE_EVENT_INITIALIZED,         /* initialized */
 344         ZONE_EVENT_READY,               /* ready */
 345         ZONE_EVENT_READY,               /* booting */
 346         ZONE_EVENT_RUNNING,             /* running */
 347         ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 348         ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 349         ZONE_EVENT_SHUTTING_DOWN,       /* down */
 350         ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 351         ZONE_EVENT_UNINITIALIZED,       /* dead */
 352 };
 353 
 354 /*
 355  * This array contains the names of the subsystems listed in zone_ref_subsys_t
 356  * (see sys/zone.h).
 357  */
 358 static char *zone_ref_subsys_names[] = {
 359         "NFS",          /* ZONE_REF_NFS */
 360         "NFSv4",        /* ZONE_REF_NFSV4 */
 361         "SMBFS",        /* ZONE_REF_SMBFS */
 362         "MNTFS",        /* ZONE_REF_MNTFS */
 363         "LOFI",         /* ZONE_REF_LOFI */
 364         "VFS",          /* ZONE_REF_VFS */
 365         "IPC"           /* ZONE_REF_IPC */
 366 };
 367 
 368 /*
 369  * This isn't static so lint doesn't complain.
 370  */
 371 rctl_hndl_t rc_zone_cpu_shares;
 372 rctl_hndl_t rc_zone_locked_mem;
 373 rctl_hndl_t rc_zone_max_swap;
 374 rctl_hndl_t rc_zone_max_lofi;
 375 rctl_hndl_t rc_zone_cpu_cap;
 376 rctl_hndl_t rc_zone_nlwps;
 377 rctl_hndl_t rc_zone_nprocs;
 378 rctl_hndl_t rc_zone_shmmax;
 379 rctl_hndl_t rc_zone_shmmni;
 380 rctl_hndl_t rc_zone_semmni;
 381 rctl_hndl_t rc_zone_msgmni;
 382 
 383 const char * const zone_default_initname = "/sbin/init";
 384 static char * const zone_prefix = "/zone/";
 385 static int zone_shutdown(zoneid_t zoneid);
 386 static int zone_add_datalink(zoneid_t, datalink_id_t);
 387 static int zone_remove_datalink(zoneid_t, datalink_id_t);
 388 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 389 static int zone_set_network(zoneid_t, zone_net_data_t *);
 390 static int zone_get_network(zoneid_t, zone_net_data_t *);
 391 
 392 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 393 
 394 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 395 static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 396 static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 397 static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 398     zone_key_t);
 399 static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 400 static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 401     kmutex_t *);
 402 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 403     kmutex_t *);
 404 
 405 /*
 406  * Bump this number when you alter the zone syscall interfaces; this is
 407  * because we need to have support for previous API versions in libc
 408  * to support patching; libc calls into the kernel to determine this number.
 409  *
 410  * Version 1 of the API is the version originally shipped with Solaris 10
 411  * Version 2 alters the zone_create system call in order to support more
 412  *     arguments by moving the args into a structure; and to do better
 413  *     error reporting when zone_create() fails.
 414  * Version 3 alters the zone_create system call in order to support the
 415  *     import of ZFS datasets to zones.
 416  * Version 4 alters the zone_create system call in order to support
 417  *     Trusted Extensions.
 418  * Version 5 alters the zone_boot system call, and converts its old
 419  *     bootargs parameter to be set by the zone_setattr API instead.
 420  * Version 6 adds the flag argument to zone_create.
 421  */
 422 static const int ZONE_SYSCALL_API_VERSION = 6;
 423 
 424 /*
 425  * Certain filesystems (such as NFS and autofs) need to know which zone
 426  * the mount is being placed in.  Because of this, we need to be able to
 427  * ensure that a zone isn't in the process of being created/destroyed such
 428  * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 429  * it gets added to the list of mounted zones, it ends up on the wrong zone's
 430  * mount list. Since a zone can't reside on an NFS file system, we don't
 431  * have to worry about the zonepath itself.
 432  *
 433  * The following functions: block_mounts()/resume_mounts() and
 434  * mount_in_progress()/mount_completed() are used by zones and the VFS
 435  * layer (respectively) to synchronize zone state transitions and new
 436  * mounts within a zone. This synchronization is on a per-zone basis, so
 437  * activity for one zone will not interfere with activity for another zone.
 438  *
 439  * The semantics are like a reader-reader lock such that there may
 440  * either be multiple mounts (or zone state transitions, if that weren't
 441  * serialized by zonehash_lock) in progress at the same time, but not
 442  * both.
 443  *
 444  * We use cv's so the user can ctrl-C out of the operation if it's
 445  * taking too long.
 446  *
 447  * The semantics are such that there is unfair bias towards the
 448  * "current" operation.  This means that zone halt may starve if
 449  * there is a rapid succession of new mounts coming in to the zone.
 450  */
 451 /*
 452  * Prevent new mounts from progressing to the point of calling
 453  * VFS_MOUNT().  If there are already mounts in this "region", wait for
 454  * them to complete.
 455  */
 456 static int
 457 block_mounts(zone_t *zp)
 458 {
 459         int retval = 0;
 460 
 461         /*
 462          * Since it may block for a long time, block_mounts() shouldn't be
 463          * called with zonehash_lock held.
 464          */
 465         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 466         mutex_enter(&zp->zone_mount_lock);
 467         while (zp->zone_mounts_in_progress > 0) {
 468                 if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 469                         goto signaled;
 470         }
 471         /*
 472          * A negative value of mounts_in_progress indicates that mounts
 473          * have been blocked by (-mounts_in_progress) different callers
 474          * (remotely possible if two threads enter zone_shutdown at the same
 475          * time).
 476          */
 477         zp->zone_mounts_in_progress--;
 478         retval = 1;
 479 signaled:
 480         mutex_exit(&zp->zone_mount_lock);
 481         return (retval);
 482 }
 483 
 484 /*
 485  * The VFS layer may progress with new mounts as far as we're concerned.
 486  * Allow them to progress if we were the last obstacle.
 487  */
 488 static void
 489 resume_mounts(zone_t *zp)
 490 {
 491         mutex_enter(&zp->zone_mount_lock);
 492         if (++zp->zone_mounts_in_progress == 0)
 493                 cv_broadcast(&zp->zone_mount_cv);
 494         mutex_exit(&zp->zone_mount_lock);
 495 }
 496 
 497 /*
 498  * The VFS layer is busy with a mount; this zone should wait until all
 499  * of its mounts are completed to progress.
 500  */
 501 void
 502 mount_in_progress(zone_t *zp)
 503 {
 504         mutex_enter(&zp->zone_mount_lock);
 505         while (zp->zone_mounts_in_progress < 0)
 506                 cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 507         zp->zone_mounts_in_progress++;
 508         mutex_exit(&zp->zone_mount_lock);
 509 }
 510 
 511 /*
 512  * VFS is done with one mount; wake up any waiting block_mounts()
 513  * callers if this is the last mount.
 514  */
 515 void
 516 mount_completed(zone_t *zp)
 517 {
 518         mutex_enter(&zp->zone_mount_lock);
 519         if (--zp->zone_mounts_in_progress == 0)
 520                 cv_broadcast(&zp->zone_mount_cv);
 521         mutex_exit(&zp->zone_mount_lock);
 522 }
 523 
 524 /*
 525  * ZSD routines.
 526  *
 527  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 528  * defined by the pthread_key_create() and related interfaces.
 529  *
 530  * Kernel subsystems may register one or more data items and/or
 531  * callbacks to be executed when a zone is created, shutdown, or
 532  * destroyed.
 533  *
 534  * Unlike the thread counterpart, destructor callbacks will be executed
 535  * even if the data pointer is NULL and/or there are no constructor
 536  * callbacks, so it is the responsibility of such callbacks to check for
 537  * NULL data values if necessary.
 538  *
 539  * The locking strategy and overall picture is as follows:
 540  *
 541  * When someone calls zone_key_create(), a template ZSD entry is added to the
 542  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 543  * holding that lock all the existing zones are marked as
 544  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 545  * zone_zsd list (protected by zone_lock). The global list is updated first
 546  * (under zsd_key_lock) to make sure that newly created zones use the
 547  * most recent list of keys. Then under zonehash_lock we walk the zones
 548  * and mark them.  Similar locking is used in zone_key_delete().
 549  *
 550  * The actual create, shutdown, and destroy callbacks are done without
 551  * holding any lock. And zsd_flags are used to ensure that the operations
 552  * completed so that when zone_key_create (and zone_create) is done, as well as
 553  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 554  * are completed.
 555  *
 556  * When new zones are created constructor callbacks for all registered ZSD
 557  * entries will be called. That also uses the above two phases of marking
 558  * what needs to be done, and then running the callbacks without holding
 559  * any locks.
 560  *
 561  * The framework does not provide any locking around zone_getspecific() and
 562  * zone_setspecific() apart from that needed for internal consistency, so
 563  * callers interested in atomic "test-and-set" semantics will need to provide
 564  * their own locking.
 565  */
 566 
 567 /*
 568  * Helper function to find the zsd_entry associated with the key in the
 569  * given list.
 570  */
 571 static struct zsd_entry *
 572 zsd_find(list_t *l, zone_key_t key)
 573 {
 574         struct zsd_entry *zsd;
 575 
 576         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 577                 if (zsd->zsd_key == key) {
 578                         return (zsd);
 579                 }
 580         }
 581         return (NULL);
 582 }
 583 
 584 /*
 585  * Helper function to find the zsd_entry associated with the key in the
 586  * given list. Move it to the front of the list.
 587  */
 588 static struct zsd_entry *
 589 zsd_find_mru(list_t *l, zone_key_t key)
 590 {
 591         struct zsd_entry *zsd;
 592 
 593         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 594                 if (zsd->zsd_key == key) {
 595                         /*
 596                          * Move to head of list to keep list in MRU order.
 597                          */
 598                         if (zsd != list_head(l)) {
 599                                 list_remove(l, zsd);
 600                                 list_insert_head(l, zsd);
 601                         }
 602                         return (zsd);
 603                 }
 604         }
 605         return (NULL);
 606 }
 607 
 608 void
 609 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 610     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 611 {
 612         struct zsd_entry *zsdp;
 613         struct zsd_entry *t;
 614         struct zone *zone;
 615         zone_key_t  key;
 616 
 617         zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 618         zsdp->zsd_data = NULL;
 619         zsdp->zsd_create = create;
 620         zsdp->zsd_shutdown = shutdown;
 621         zsdp->zsd_destroy = destroy;
 622 
 623         /*
 624          * Insert in global list of callbacks. Makes future zone creations
 625          * see it.
 626          */
 627         mutex_enter(&zsd_key_lock);
 628         key = zsdp->zsd_key = ++zsd_keyval;
 629         ASSERT(zsd_keyval != 0);
 630         list_insert_tail(&zsd_registered_keys, zsdp);
 631         mutex_exit(&zsd_key_lock);
 632 
 633         /*
 634          * Insert for all existing zones and mark them as needing
 635          * a create callback.
 636          */
 637         mutex_enter(&zonehash_lock);        /* stop the world */
 638         for (zone = list_head(&zone_active); zone != NULL;
 639             zone = list_next(&zone_active, zone)) {
 640                 zone_status_t status;
 641 
 642                 mutex_enter(&zone->zone_lock);
 643 
 644                 /* Skip zones that are on the way down or not yet up */
 645                 status = zone_status_get(zone);
 646                 if (status >= ZONE_IS_DOWN ||
 647                     status == ZONE_IS_UNINITIALIZED) {
 648                         mutex_exit(&zone->zone_lock);
 649                         continue;
 650                 }
 651 
 652                 t = zsd_find_mru(&zone->zone_zsd, key);
 653                 if (t != NULL) {
 654                         /*
 655                          * A zsd_configure already inserted it after
 656                          * we dropped zsd_key_lock above.
 657                          */
 658                         mutex_exit(&zone->zone_lock);
 659                         continue;
 660                 }
 661                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 662                 t->zsd_key = key;
 663                 t->zsd_create = create;
 664                 t->zsd_shutdown = shutdown;
 665                 t->zsd_destroy = destroy;
 666                 if (create != NULL) {
 667                         t->zsd_flags = ZSD_CREATE_NEEDED;
 668                         DTRACE_PROBE2(zsd__create__needed,
 669                             zone_t *, zone, zone_key_t, key);
 670                 }
 671                 list_insert_tail(&zone->zone_zsd, t);
 672                 mutex_exit(&zone->zone_lock);
 673         }
 674         mutex_exit(&zonehash_lock);
 675 
 676         if (create != NULL) {
 677                 /* Now call the create callback for this key */
 678                 zsd_apply_all_zones(zsd_apply_create, key);
 679         }
 680         /*
 681          * It is safe for consumers to use the key now, make it
 682          * globally visible. Specifically zone_getspecific() will
 683          * always successfully return the zone specific data associated
 684          * with the key.
 685          */
 686         *keyp = key;
 687 
 688 }
 689 
 690 /*
 691  * Function called when a module is being unloaded, or otherwise wishes
 692  * to unregister its ZSD key and callbacks.
 693  *
 694  * Remove from the global list and determine the functions that need to
 695  * be called under a global lock. Then call the functions without
 696  * holding any locks. Finally free up the zone_zsd entries. (The apply
 697  * functions need to access the zone_zsd entries to find zsd_data etc.)
 698  */
 699 int
 700 zone_key_delete(zone_key_t key)
 701 {
 702         struct zsd_entry *zsdp = NULL;
 703         zone_t *zone;
 704 
 705         mutex_enter(&zsd_key_lock);
 706         zsdp = zsd_find_mru(&zsd_registered_keys, key);
 707         if (zsdp == NULL) {
 708                 mutex_exit(&zsd_key_lock);
 709                 return (-1);
 710         }
 711         list_remove(&zsd_registered_keys, zsdp);
 712         mutex_exit(&zsd_key_lock);
 713 
 714         mutex_enter(&zonehash_lock);
 715         for (zone = list_head(&zone_active); zone != NULL;
 716             zone = list_next(&zone_active, zone)) {
 717                 struct zsd_entry *del;
 718 
 719                 mutex_enter(&zone->zone_lock);
 720                 del = zsd_find_mru(&zone->zone_zsd, key);
 721                 if (del == NULL) {
 722                         /*
 723                          * Somebody else got here first e.g the zone going
 724                          * away.
 725                          */
 726                         mutex_exit(&zone->zone_lock);
 727                         continue;
 728                 }
 729                 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 730                 ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 731                 if (del->zsd_shutdown != NULL &&
 732                     (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 733                         del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 734                         DTRACE_PROBE2(zsd__shutdown__needed,
 735                             zone_t *, zone, zone_key_t, key);
 736                 }
 737                 if (del->zsd_destroy != NULL &&
 738                     (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 739                         del->zsd_flags |= ZSD_DESTROY_NEEDED;
 740                         DTRACE_PROBE2(zsd__destroy__needed,
 741                             zone_t *, zone, zone_key_t, key);
 742                 }
 743                 mutex_exit(&zone->zone_lock);
 744         }
 745         mutex_exit(&zonehash_lock);
 746         kmem_free(zsdp, sizeof (*zsdp));
 747 
 748         /* Now call the shutdown and destroy callback for this key */
 749         zsd_apply_all_zones(zsd_apply_shutdown, key);
 750         zsd_apply_all_zones(zsd_apply_destroy, key);
 751 
 752         /* Now we can free up the zsdp structures in each zone */
 753         mutex_enter(&zonehash_lock);
 754         for (zone = list_head(&zone_active); zone != NULL;
 755             zone = list_next(&zone_active, zone)) {
 756                 struct zsd_entry *del;
 757 
 758                 mutex_enter(&zone->zone_lock);
 759                 del = zsd_find(&zone->zone_zsd, key);
 760                 if (del != NULL) {
 761                         list_remove(&zone->zone_zsd, del);
 762                         ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 763                         kmem_free(del, sizeof (*del));
 764                 }
 765                 mutex_exit(&zone->zone_lock);
 766         }
 767         mutex_exit(&zonehash_lock);
 768 
 769         return (0);
 770 }
 771 
 772 /*
 773  * ZSD counterpart of pthread_setspecific().
 774  *
 775  * Since all zsd callbacks, including those with no create function,
 776  * have an entry in zone_zsd, if the key is registered it is part of
 777  * the zone_zsd list.
 778  * Return an error if the key wasn't registerd.
 779  */
 780 int
 781 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 782 {
 783         struct zsd_entry *t;
 784 
 785         mutex_enter(&zone->zone_lock);
 786         t = zsd_find_mru(&zone->zone_zsd, key);
 787         if (t != NULL) {
 788                 /*
 789                  * Replace old value with new
 790                  */
 791                 t->zsd_data = (void *)data;
 792                 mutex_exit(&zone->zone_lock);
 793                 return (0);
 794         }
 795         mutex_exit(&zone->zone_lock);
 796         return (-1);
 797 }
 798 
 799 /*
 800  * ZSD counterpart of pthread_getspecific().
 801  */
 802 void *
 803 zone_getspecific(zone_key_t key, zone_t *zone)
 804 {
 805         struct zsd_entry *t;
 806         void *data;
 807 
 808         mutex_enter(&zone->zone_lock);
 809         t = zsd_find_mru(&zone->zone_zsd, key);
 810         data = (t == NULL ? NULL : t->zsd_data);
 811         mutex_exit(&zone->zone_lock);
 812         return (data);
 813 }
 814 
 815 /*
 816  * Function used to initialize a zone's list of ZSD callbacks and data
 817  * when the zone is being created.  The callbacks are initialized from
 818  * the template list (zsd_registered_keys). The constructor callback is
 819  * executed later (once the zone exists and with locks dropped).
 820  */
 821 static void
 822 zone_zsd_configure(zone_t *zone)
 823 {
 824         struct zsd_entry *zsdp;
 825         struct zsd_entry *t;
 826 
 827         ASSERT(MUTEX_HELD(&zonehash_lock));
 828         ASSERT(list_head(&zone->zone_zsd) == NULL);
 829         mutex_enter(&zone->zone_lock);
 830         mutex_enter(&zsd_key_lock);
 831         for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 832             zsdp = list_next(&zsd_registered_keys, zsdp)) {
 833                 /*
 834                  * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 835                  * should not have added anything to it.
 836                  */
 837                 ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 838 
 839                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 840                 t->zsd_key = zsdp->zsd_key;
 841                 t->zsd_create = zsdp->zsd_create;
 842                 t->zsd_shutdown = zsdp->zsd_shutdown;
 843                 t->zsd_destroy = zsdp->zsd_destroy;
 844                 if (zsdp->zsd_create != NULL) {
 845                         t->zsd_flags = ZSD_CREATE_NEEDED;
 846                         DTRACE_PROBE2(zsd__create__needed,
 847                             zone_t *, zone, zone_key_t, zsdp->zsd_key);
 848                 }
 849                 list_insert_tail(&zone->zone_zsd, t);
 850         }
 851         mutex_exit(&zsd_key_lock);
 852         mutex_exit(&zone->zone_lock);
 853 }
 854 
 855 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 856 
 857 /*
 858  * Helper function to execute shutdown or destructor callbacks.
 859  */
 860 static void
 861 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 862 {
 863         struct zsd_entry *t;
 864 
 865         ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 866         ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 867         ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 868 
 869         /*
 870          * Run the callback solely based on what is registered for the zone
 871          * in zone_zsd. The global list can change independently of this
 872          * as keys are registered and unregistered and we don't register new
 873          * callbacks for a zone that is in the process of going away.
 874          */
 875         mutex_enter(&zone->zone_lock);
 876         for (t = list_head(&zone->zone_zsd); t != NULL;
 877             t = list_next(&zone->zone_zsd, t)) {
 878                 zone_key_t key = t->zsd_key;
 879 
 880                 /* Skip if no callbacks registered */
 881 
 882                 if (ct == ZSD_SHUTDOWN) {
 883                         if (t->zsd_shutdown != NULL &&
 884                             (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 885                                 t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 886                                 DTRACE_PROBE2(zsd__shutdown__needed,
 887                                     zone_t *, zone, zone_key_t, key);
 888                         }
 889                 } else {
 890                         if (t->zsd_destroy != NULL &&
 891                             (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 892                                 t->zsd_flags |= ZSD_DESTROY_NEEDED;
 893                                 DTRACE_PROBE2(zsd__destroy__needed,
 894                                     zone_t *, zone, zone_key_t, key);
 895                         }
 896                 }
 897         }
 898         mutex_exit(&zone->zone_lock);
 899 
 900         /* Now call the shutdown and destroy callback for this key */
 901         zsd_apply_all_keys(zsd_apply_shutdown, zone);
 902         zsd_apply_all_keys(zsd_apply_destroy, zone);
 903 
 904 }
 905 
 906 /*
 907  * Called when the zone is going away; free ZSD-related memory, and
 908  * destroy the zone_zsd list.
 909  */
 910 static void
 911 zone_free_zsd(zone_t *zone)
 912 {
 913         struct zsd_entry *t, *next;
 914 
 915         /*
 916          * Free all the zsd_entry's we had on this zone.
 917          */
 918         mutex_enter(&zone->zone_lock);
 919         for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 920                 next = list_next(&zone->zone_zsd, t);
 921                 list_remove(&zone->zone_zsd, t);
 922                 ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 923                 kmem_free(t, sizeof (*t));
 924         }
 925         list_destroy(&zone->zone_zsd);
 926         mutex_exit(&zone->zone_lock);
 927 
 928 }
 929 
 930 /*
 931  * Apply a function to all zones for particular key value.
 932  *
 933  * The applyfn has to drop zonehash_lock if it does some work, and
 934  * then reacquire it before it returns.
 935  * When the lock is dropped we don't follow list_next even
 936  * if it is possible to do so without any hazards. This is
 937  * because we want the design to allow for the list of zones
 938  * to change in any arbitrary way during the time the
 939  * lock was dropped.
 940  *
 941  * It is safe to restart the loop at list_head since the applyfn
 942  * changes the zsd_flags as it does work, so a subsequent
 943  * pass through will have no effect in applyfn, hence the loop will terminate
 944  * in at worst O(N^2).
 945  */
 946 static void
 947 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 948 {
 949         zone_t *zone;
 950 
 951         mutex_enter(&zonehash_lock);
 952         zone = list_head(&zone_active);
 953         while (zone != NULL) {
 954                 if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 955                         /* Lock dropped - restart at head */
 956                         zone = list_head(&zone_active);
 957                 } else {
 958                         zone = list_next(&zone_active, zone);
 959                 }
 960         }
 961         mutex_exit(&zonehash_lock);
 962 }
 963 
 964 /*
 965  * Apply a function to all keys for a particular zone.
 966  *
 967  * The applyfn has to drop zonehash_lock if it does some work, and
 968  * then reacquire it before it returns.
 969  * When the lock is dropped we don't follow list_next even
 970  * if it is possible to do so without any hazards. This is
 971  * because we want the design to allow for the list of zsd callbacks
 972  * to change in any arbitrary way during the time the
 973  * lock was dropped.
 974  *
 975  * It is safe to restart the loop at list_head since the applyfn
 976  * changes the zsd_flags as it does work, so a subsequent
 977  * pass through will have no effect in applyfn, hence the loop will terminate
 978  * in at worst O(N^2).
 979  */
 980 static void
 981 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 982 {
 983         struct zsd_entry *t;
 984 
 985         mutex_enter(&zone->zone_lock);
 986         t = list_head(&zone->zone_zsd);
 987         while (t != NULL) {
 988                 if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 989                         /* Lock dropped - restart at head */
 990                         t = list_head(&zone->zone_zsd);
 991                 } else {
 992                         t = list_next(&zone->zone_zsd, t);
 993                 }
 994         }
 995         mutex_exit(&zone->zone_lock);
 996 }
 997 
 998 /*
 999  * Call the create function for the zone and key if CREATE_NEEDED
1000  * is set.
1001  * If some other thread gets here first and sets CREATE_INPROGRESS, then
1002  * we wait for that thread to complete so that we can ensure that
1003  * all the callbacks are done when we've looped over all zones/keys.
1004  *
1005  * When we call the create function, we drop the global held by the
1006  * caller, and return true to tell the caller it needs to re-evalute the
1007  * state.
1008  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1009  * remains held on exit.
1010  */
1011 static boolean_t
1012 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1013     zone_t *zone, zone_key_t key)
1014 {
1015         void *result;
1016         struct zsd_entry *t;
1017         boolean_t dropped;
1018 
1019         if (lockp != NULL) {
1020                 ASSERT(MUTEX_HELD(lockp));
1021         }
1022         if (zone_lock_held) {
1023                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1024         } else {
1025                 mutex_enter(&zone->zone_lock);
1026         }
1027 
1028         t = zsd_find(&zone->zone_zsd, key);
1029         if (t == NULL) {
1030                 /*
1031                  * Somebody else got here first e.g the zone going
1032                  * away.
1033                  */
1034                 if (!zone_lock_held)
1035                         mutex_exit(&zone->zone_lock);
1036                 return (B_FALSE);
1037         }
1038         dropped = B_FALSE;
1039         if (zsd_wait_for_inprogress(zone, t, lockp))
1040                 dropped = B_TRUE;
1041 
1042         if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1043                 t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1044                 t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1045                 DTRACE_PROBE2(zsd__create__inprogress,
1046                     zone_t *, zone, zone_key_t, key);
1047                 mutex_exit(&zone->zone_lock);
1048                 if (lockp != NULL)
1049                         mutex_exit(lockp);
1050 
1051                 dropped = B_TRUE;
1052                 ASSERT(t->zsd_create != NULL);
1053                 DTRACE_PROBE2(zsd__create__start,
1054                     zone_t *, zone, zone_key_t, key);
1055 
1056                 result = (*t->zsd_create)(zone->zone_id);
1057 
1058                 DTRACE_PROBE2(zsd__create__end,
1059                     zone_t *, zone, voidn *, result);
1060 
1061                 ASSERT(result != NULL);
1062                 if (lockp != NULL)
1063                         mutex_enter(lockp);
1064                 mutex_enter(&zone->zone_lock);
1065                 t->zsd_data = result;
1066                 t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1067                 t->zsd_flags |= ZSD_CREATE_COMPLETED;
1068                 cv_broadcast(&t->zsd_cv);
1069                 DTRACE_PROBE2(zsd__create__completed,
1070                     zone_t *, zone, zone_key_t, key);
1071         }
1072         if (!zone_lock_held)
1073                 mutex_exit(&zone->zone_lock);
1074         return (dropped);
1075 }
1076 
1077 /*
1078  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1079  * is set.
1080  * If some other thread gets here first and sets *_INPROGRESS, then
1081  * we wait for that thread to complete so that we can ensure that
1082  * all the callbacks are done when we've looped over all zones/keys.
1083  *
1084  * When we call the shutdown function, we drop the global held by the
1085  * caller, and return true to tell the caller it needs to re-evalute the
1086  * state.
1087  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1088  * remains held on exit.
1089  */
1090 static boolean_t
1091 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1092     zone_t *zone, zone_key_t key)
1093 {
1094         struct zsd_entry *t;
1095         void *data;
1096         boolean_t dropped;
1097 
1098         if (lockp != NULL) {
1099                 ASSERT(MUTEX_HELD(lockp));
1100         }
1101         if (zone_lock_held) {
1102                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1103         } else {
1104                 mutex_enter(&zone->zone_lock);
1105         }
1106 
1107         t = zsd_find(&zone->zone_zsd, key);
1108         if (t == NULL) {
1109                 /*
1110                  * Somebody else got here first e.g the zone going
1111                  * away.
1112                  */
1113                 if (!zone_lock_held)
1114                         mutex_exit(&zone->zone_lock);
1115                 return (B_FALSE);
1116         }
1117         dropped = B_FALSE;
1118         if (zsd_wait_for_creator(zone, t, lockp))
1119                 dropped = B_TRUE;
1120 
1121         if (zsd_wait_for_inprogress(zone, t, lockp))
1122                 dropped = B_TRUE;
1123 
1124         if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1125                 t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1126                 t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1127                 DTRACE_PROBE2(zsd__shutdown__inprogress,
1128                     zone_t *, zone, zone_key_t, key);
1129                 mutex_exit(&zone->zone_lock);
1130                 if (lockp != NULL)
1131                         mutex_exit(lockp);
1132                 dropped = B_TRUE;
1133 
1134                 ASSERT(t->zsd_shutdown != NULL);
1135                 data = t->zsd_data;
1136 
1137                 DTRACE_PROBE2(zsd__shutdown__start,
1138                     zone_t *, zone, zone_key_t, key);
1139 
1140                 (t->zsd_shutdown)(zone->zone_id, data);
1141                 DTRACE_PROBE2(zsd__shutdown__end,
1142                     zone_t *, zone, zone_key_t, key);
1143 
1144                 if (lockp != NULL)
1145                         mutex_enter(lockp);
1146                 mutex_enter(&zone->zone_lock);
1147                 t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1148                 t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1149                 cv_broadcast(&t->zsd_cv);
1150                 DTRACE_PROBE2(zsd__shutdown__completed,
1151                     zone_t *, zone, zone_key_t, key);
1152         }
1153         if (!zone_lock_held)
1154                 mutex_exit(&zone->zone_lock);
1155         return (dropped);
1156 }
1157 
1158 /*
1159  * Call the destroy function for the zone and key if DESTROY_NEEDED
1160  * is set.
1161  * If some other thread gets here first and sets *_INPROGRESS, then
1162  * we wait for that thread to complete so that we can ensure that
1163  * all the callbacks are done when we've looped over all zones/keys.
1164  *
1165  * When we call the destroy function, we drop the global held by the
1166  * caller, and return true to tell the caller it needs to re-evalute the
1167  * state.
1168  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1169  * remains held on exit.
1170  */
1171 static boolean_t
1172 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1173     zone_t *zone, zone_key_t key)
1174 {
1175         struct zsd_entry *t;
1176         void *data;
1177         boolean_t dropped;
1178 
1179         if (lockp != NULL) {
1180                 ASSERT(MUTEX_HELD(lockp));
1181         }
1182         if (zone_lock_held) {
1183                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1184         } else {
1185                 mutex_enter(&zone->zone_lock);
1186         }
1187 
1188         t = zsd_find(&zone->zone_zsd, key);
1189         if (t == NULL) {
1190                 /*
1191                  * Somebody else got here first e.g the zone going
1192                  * away.
1193                  */
1194                 if (!zone_lock_held)
1195                         mutex_exit(&zone->zone_lock);
1196                 return (B_FALSE);
1197         }
1198         dropped = B_FALSE;
1199         if (zsd_wait_for_creator(zone, t, lockp))
1200                 dropped = B_TRUE;
1201 
1202         if (zsd_wait_for_inprogress(zone, t, lockp))
1203                 dropped = B_TRUE;
1204 
1205         if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1206                 t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1207                 t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1208                 DTRACE_PROBE2(zsd__destroy__inprogress,
1209                     zone_t *, zone, zone_key_t, key);
1210                 mutex_exit(&zone->zone_lock);
1211                 if (lockp != NULL)
1212                         mutex_exit(lockp);
1213                 dropped = B_TRUE;
1214 
1215                 ASSERT(t->zsd_destroy != NULL);
1216                 data = t->zsd_data;
1217                 DTRACE_PROBE2(zsd__destroy__start,
1218                     zone_t *, zone, zone_key_t, key);
1219 
1220                 (t->zsd_destroy)(zone->zone_id, data);
1221                 DTRACE_PROBE2(zsd__destroy__end,
1222                     zone_t *, zone, zone_key_t, key);
1223 
1224                 if (lockp != NULL)
1225                         mutex_enter(lockp);
1226                 mutex_enter(&zone->zone_lock);
1227                 t->zsd_data = NULL;
1228                 t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1229                 t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1230                 cv_broadcast(&t->zsd_cv);
1231                 DTRACE_PROBE2(zsd__destroy__completed,
1232                     zone_t *, zone, zone_key_t, key);
1233         }
1234         if (!zone_lock_held)
1235                 mutex_exit(&zone->zone_lock);
1236         return (dropped);
1237 }
1238 
1239 /*
1240  * Wait for any CREATE_NEEDED flag to be cleared.
1241  * Returns true if lockp was temporarily dropped while waiting.
1242  */
1243 static boolean_t
1244 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1245 {
1246         boolean_t dropped = B_FALSE;
1247 
1248         while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1249                 DTRACE_PROBE2(zsd__wait__for__creator,
1250                     zone_t *, zone, struct zsd_entry *, t);
1251                 if (lockp != NULL) {
1252                         dropped = B_TRUE;
1253                         mutex_exit(lockp);
1254                 }
1255                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1256                 if (lockp != NULL) {
1257                         /* First drop zone_lock to preserve order */
1258                         mutex_exit(&zone->zone_lock);
1259                         mutex_enter(lockp);
1260                         mutex_enter(&zone->zone_lock);
1261                 }
1262         }
1263         return (dropped);
1264 }
1265 
1266 /*
1267  * Wait for any INPROGRESS flag to be cleared.
1268  * Returns true if lockp was temporarily dropped while waiting.
1269  */
1270 static boolean_t
1271 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1272 {
1273         boolean_t dropped = B_FALSE;
1274 
1275         while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1276                 DTRACE_PROBE2(zsd__wait__for__inprogress,
1277                     zone_t *, zone, struct zsd_entry *, t);
1278                 if (lockp != NULL) {
1279                         dropped = B_TRUE;
1280                         mutex_exit(lockp);
1281                 }
1282                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1283                 if (lockp != NULL) {
1284                         /* First drop zone_lock to preserve order */
1285                         mutex_exit(&zone->zone_lock);
1286                         mutex_enter(lockp);
1287                         mutex_enter(&zone->zone_lock);
1288                 }
1289         }
1290         return (dropped);
1291 }
1292 
1293 /*
1294  * Frees memory associated with the zone dataset list.
1295  */
1296 static void
1297 zone_free_datasets(zone_t *zone)
1298 {
1299         zone_dataset_t *t, *next;
1300 
1301         for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1302                 next = list_next(&zone->zone_datasets, t);
1303                 list_remove(&zone->zone_datasets, t);
1304                 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1305                 kmem_free(t, sizeof (*t));
1306         }
1307         list_destroy(&zone->zone_datasets);
1308 }
1309 
1310 /*
1311  * zone.cpu-shares resource control support.
1312  */
1313 /*ARGSUSED*/
1314 static rctl_qty_t
1315 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1316 {
1317         ASSERT(MUTEX_HELD(&p->p_lock));
1318         return (p->p_zone->zone_shares);
1319 }
1320 
1321 /*ARGSUSED*/
1322 static int
1323 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1324     rctl_qty_t nv)
1325 {
1326         ASSERT(MUTEX_HELD(&p->p_lock));
1327         ASSERT(e->rcep_t == RCENTITY_ZONE);
1328         if (e->rcep_p.zone == NULL)
1329                 return (0);
1330 
1331         e->rcep_p.zone->zone_shares = nv;
1332         return (0);
1333 }
1334 
1335 static rctl_ops_t zone_cpu_shares_ops = {
1336         rcop_no_action,
1337         zone_cpu_shares_usage,
1338         zone_cpu_shares_set,
1339         rcop_no_test
1340 };
1341 
1342 /*
1343  * zone.cpu-cap resource control support.
1344  */
1345 /*ARGSUSED*/
1346 static rctl_qty_t
1347 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1348 {
1349         ASSERT(MUTEX_HELD(&p->p_lock));
1350         return (cpucaps_zone_get(p->p_zone));
1351 }
1352 
1353 /*ARGSUSED*/
1354 static int
1355 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1356     rctl_qty_t nv)
1357 {
1358         zone_t *zone = e->rcep_p.zone;
1359 
1360         ASSERT(MUTEX_HELD(&p->p_lock));
1361         ASSERT(e->rcep_t == RCENTITY_ZONE);
1362 
1363         if (zone == NULL)
1364                 return (0);
1365 
1366         /*
1367          * set cap to the new value.
1368          */
1369         return (cpucaps_zone_set(zone, nv));
1370 }
1371 
1372 static rctl_ops_t zone_cpu_cap_ops = {
1373         rcop_no_action,
1374         zone_cpu_cap_get,
1375         zone_cpu_cap_set,
1376         rcop_no_test
1377 };
1378 
1379 /*ARGSUSED*/
1380 static rctl_qty_t
1381 zone_lwps_usage(rctl_t *r, proc_t *p)
1382 {
1383         rctl_qty_t nlwps;
1384         zone_t *zone = p->p_zone;
1385 
1386         ASSERT(MUTEX_HELD(&p->p_lock));
1387 
1388         mutex_enter(&zone->zone_nlwps_lock);
1389         nlwps = zone->zone_nlwps;
1390         mutex_exit(&zone->zone_nlwps_lock);
1391 
1392         return (nlwps);
1393 }
1394 
1395 /*ARGSUSED*/
1396 static int
1397 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1398     rctl_qty_t incr, uint_t flags)
1399 {
1400         rctl_qty_t nlwps;
1401 
1402         ASSERT(MUTEX_HELD(&p->p_lock));
1403         ASSERT(e->rcep_t == RCENTITY_ZONE);
1404         if (e->rcep_p.zone == NULL)
1405                 return (0);
1406         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1407         nlwps = e->rcep_p.zone->zone_nlwps;
1408 
1409         if (nlwps + incr > rcntl->rcv_value)
1410                 return (1);
1411 
1412         return (0);
1413 }
1414 
1415 /*ARGSUSED*/
1416 static int
1417 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1418 {
1419         ASSERT(MUTEX_HELD(&p->p_lock));
1420         ASSERT(e->rcep_t == RCENTITY_ZONE);
1421         if (e->rcep_p.zone == NULL)
1422                 return (0);
1423         e->rcep_p.zone->zone_nlwps_ctl = nv;
1424         return (0);
1425 }
1426 
1427 static rctl_ops_t zone_lwps_ops = {
1428         rcop_no_action,
1429         zone_lwps_usage,
1430         zone_lwps_set,
1431         zone_lwps_test,
1432 };
1433 
1434 /*ARGSUSED*/
1435 static rctl_qty_t
1436 zone_procs_usage(rctl_t *r, proc_t *p)
1437 {
1438         rctl_qty_t nprocs;
1439         zone_t *zone = p->p_zone;
1440 
1441         ASSERT(MUTEX_HELD(&p->p_lock));
1442 
1443         mutex_enter(&zone->zone_nlwps_lock);
1444         nprocs = zone->zone_nprocs;
1445         mutex_exit(&zone->zone_nlwps_lock);
1446 
1447         return (nprocs);
1448 }
1449 
1450 /*ARGSUSED*/
1451 static int
1452 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1453     rctl_qty_t incr, uint_t flags)
1454 {
1455         rctl_qty_t nprocs;
1456 
1457         ASSERT(MUTEX_HELD(&p->p_lock));
1458         ASSERT(e->rcep_t == RCENTITY_ZONE);
1459         if (e->rcep_p.zone == NULL)
1460                 return (0);
1461         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1462         nprocs = e->rcep_p.zone->zone_nprocs;
1463 
1464         if (nprocs + incr > rcntl->rcv_value)
1465                 return (1);
1466 
1467         return (0);
1468 }
1469 
1470 /*ARGSUSED*/
1471 static int
1472 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1473 {
1474         ASSERT(MUTEX_HELD(&p->p_lock));
1475         ASSERT(e->rcep_t == RCENTITY_ZONE);
1476         if (e->rcep_p.zone == NULL)
1477                 return (0);
1478         e->rcep_p.zone->zone_nprocs_ctl = nv;
1479         return (0);
1480 }
1481 
1482 static rctl_ops_t zone_procs_ops = {
1483         rcop_no_action,
1484         zone_procs_usage,
1485         zone_procs_set,
1486         zone_procs_test,
1487 };
1488 
1489 /*ARGSUSED*/
1490 static int
1491 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1492     rctl_qty_t incr, uint_t flags)
1493 {
1494         rctl_qty_t v;
1495         ASSERT(MUTEX_HELD(&p->p_lock));
1496         ASSERT(e->rcep_t == RCENTITY_ZONE);
1497         v = e->rcep_p.zone->zone_shmmax + incr;
1498         if (v > rval->rcv_value)
1499                 return (1);
1500         return (0);
1501 }
1502 
1503 static rctl_ops_t zone_shmmax_ops = {
1504         rcop_no_action,
1505         rcop_no_usage,
1506         rcop_no_set,
1507         zone_shmmax_test
1508 };
1509 
1510 /*ARGSUSED*/
1511 static int
1512 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1513     rctl_qty_t incr, uint_t flags)
1514 {
1515         rctl_qty_t v;
1516         ASSERT(MUTEX_HELD(&p->p_lock));
1517         ASSERT(e->rcep_t == RCENTITY_ZONE);
1518         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1519         if (v > rval->rcv_value)
1520                 return (1);
1521         return (0);
1522 }
1523 
1524 static rctl_ops_t zone_shmmni_ops = {
1525         rcop_no_action,
1526         rcop_no_usage,
1527         rcop_no_set,
1528         zone_shmmni_test
1529 };
1530 
1531 /*ARGSUSED*/
1532 static int
1533 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1534     rctl_qty_t incr, uint_t flags)
1535 {
1536         rctl_qty_t v;
1537         ASSERT(MUTEX_HELD(&p->p_lock));
1538         ASSERT(e->rcep_t == RCENTITY_ZONE);
1539         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1540         if (v > rval->rcv_value)
1541                 return (1);
1542         return (0);
1543 }
1544 
1545 static rctl_ops_t zone_semmni_ops = {
1546         rcop_no_action,
1547         rcop_no_usage,
1548         rcop_no_set,
1549         zone_semmni_test
1550 };
1551 
1552 /*ARGSUSED*/
1553 static int
1554 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1555     rctl_qty_t incr, uint_t flags)
1556 {
1557         rctl_qty_t v;
1558         ASSERT(MUTEX_HELD(&p->p_lock));
1559         ASSERT(e->rcep_t == RCENTITY_ZONE);
1560         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1561         if (v > rval->rcv_value)
1562                 return (1);
1563         return (0);
1564 }
1565 
1566 static rctl_ops_t zone_msgmni_ops = {
1567         rcop_no_action,
1568         rcop_no_usage,
1569         rcop_no_set,
1570         zone_msgmni_test
1571 };
1572 
1573 /*ARGSUSED*/
1574 static rctl_qty_t
1575 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1576 {
1577         rctl_qty_t q;
1578         ASSERT(MUTEX_HELD(&p->p_lock));
1579         mutex_enter(&p->p_zone->zone_mem_lock);
1580         q = p->p_zone->zone_locked_mem;
1581         mutex_exit(&p->p_zone->zone_mem_lock);
1582         return (q);
1583 }
1584 
1585 /*ARGSUSED*/
1586 static int
1587 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1588     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1589 {
1590         rctl_qty_t q;
1591         zone_t *z;
1592 
1593         z = e->rcep_p.zone;
1594         ASSERT(MUTEX_HELD(&p->p_lock));
1595         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1596         q = z->zone_locked_mem;
1597         if (q + incr > rcntl->rcv_value)
1598                 return (1);
1599         return (0);
1600 }
1601 
1602 /*ARGSUSED*/
1603 static int
1604 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1605     rctl_qty_t nv)
1606 {
1607         ASSERT(MUTEX_HELD(&p->p_lock));
1608         ASSERT(e->rcep_t == RCENTITY_ZONE);
1609         if (e->rcep_p.zone == NULL)
1610                 return (0);
1611         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1612         return (0);
1613 }
1614 
1615 static rctl_ops_t zone_locked_mem_ops = {
1616         rcop_no_action,
1617         zone_locked_mem_usage,
1618         zone_locked_mem_set,
1619         zone_locked_mem_test
1620 };
1621 
1622 /*ARGSUSED*/
1623 static rctl_qty_t
1624 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1625 {
1626         rctl_qty_t q;
1627         zone_t *z = p->p_zone;
1628 
1629         ASSERT(MUTEX_HELD(&p->p_lock));
1630         mutex_enter(&z->zone_mem_lock);
1631         q = z->zone_max_swap;
1632         mutex_exit(&z->zone_mem_lock);
1633         return (q);
1634 }
1635 
1636 /*ARGSUSED*/
1637 static int
1638 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1639     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1640 {
1641         rctl_qty_t q;
1642         zone_t *z;
1643 
1644         z = e->rcep_p.zone;
1645         ASSERT(MUTEX_HELD(&p->p_lock));
1646         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1647         q = z->zone_max_swap;
1648         if (q + incr > rcntl->rcv_value)
1649                 return (1);
1650         return (0);
1651 }
1652 
1653 /*ARGSUSED*/
1654 static int
1655 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1656     rctl_qty_t nv)
1657 {
1658         ASSERT(MUTEX_HELD(&p->p_lock));
1659         ASSERT(e->rcep_t == RCENTITY_ZONE);
1660         if (e->rcep_p.zone == NULL)
1661                 return (0);
1662         e->rcep_p.zone->zone_max_swap_ctl = nv;
1663         return (0);
1664 }
1665 
1666 static rctl_ops_t zone_max_swap_ops = {
1667         rcop_no_action,
1668         zone_max_swap_usage,
1669         zone_max_swap_set,
1670         zone_max_swap_test
1671 };
1672 
1673 /*ARGSUSED*/
1674 static rctl_qty_t
1675 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1676 {
1677         rctl_qty_t q;
1678         zone_t *z = p->p_zone;
1679 
1680         ASSERT(MUTEX_HELD(&p->p_lock));
1681         mutex_enter(&z->zone_rctl_lock);
1682         q = z->zone_max_lofi;
1683         mutex_exit(&z->zone_rctl_lock);
1684         return (q);
1685 }
1686 
1687 /*ARGSUSED*/
1688 static int
1689 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1690     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1691 {
1692         rctl_qty_t q;
1693         zone_t *z;
1694 
1695         z = e->rcep_p.zone;
1696         ASSERT(MUTEX_HELD(&p->p_lock));
1697         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1698         q = z->zone_max_lofi;
1699         if (q + incr > rcntl->rcv_value)
1700                 return (1);
1701         return (0);
1702 }
1703 
1704 /*ARGSUSED*/
1705 static int
1706 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1707     rctl_qty_t nv)
1708 {
1709         ASSERT(MUTEX_HELD(&p->p_lock));
1710         ASSERT(e->rcep_t == RCENTITY_ZONE);
1711         if (e->rcep_p.zone == NULL)
1712                 return (0);
1713         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1714         return (0);
1715 }
1716 
1717 static rctl_ops_t zone_max_lofi_ops = {
1718         rcop_no_action,
1719         zone_max_lofi_usage,
1720         zone_max_lofi_set,
1721         zone_max_lofi_test
1722 };
1723 
1724 /*
1725  * Helper function to brand the zone with a unique ID.
1726  */
1727 static void
1728 zone_uniqid(zone_t *zone)
1729 {
1730         static uint64_t uniqid = 0;
1731 
1732         ASSERT(MUTEX_HELD(&zonehash_lock));
1733         zone->zone_uniqid = uniqid++;
1734 }
1735 
1736 /*
1737  * Returns a held pointer to the "kcred" for the specified zone.
1738  */
1739 struct cred *
1740 zone_get_kcred(zoneid_t zoneid)
1741 {
1742         zone_t *zone;
1743         cred_t *cr;
1744 
1745         if ((zone = zone_find_by_id(zoneid)) == NULL)
1746                 return (NULL);
1747         cr = zone->zone_kcred;
1748         crhold(cr);
1749         zone_rele(zone);
1750         return (cr);
1751 }
1752 
1753 static int
1754 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1755 {
1756         zone_t *zone = ksp->ks_private;
1757         zone_kstat_t *zk = ksp->ks_data;
1758 
1759         if (rw == KSTAT_WRITE)
1760                 return (EACCES);
1761 
1762         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1763         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1764         return (0);
1765 }
1766 
1767 static int
1768 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1769 {
1770         zone_t *zone = ksp->ks_private;
1771         zone_kstat_t *zk = ksp->ks_data;
1772 
1773         if (rw == KSTAT_WRITE)
1774                 return (EACCES);
1775 
1776         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1777         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1778         return (0);
1779 }
1780 
1781 static int
1782 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1783 {
1784         zone_t *zone = ksp->ks_private;
1785         zone_kstat_t *zk = ksp->ks_data;
1786 
1787         if (rw == KSTAT_WRITE)
1788                 return (EACCES);
1789 
1790         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1791         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1792         return (0);
1793 }
1794 
1795 static kstat_t *
1796 zone_kstat_create_common(zone_t *zone, char *name,
1797     int (*updatefunc) (kstat_t *, int))
1798 {
1799         kstat_t *ksp;
1800         zone_kstat_t *zk;
1801 
1802         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1803             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1804             KSTAT_FLAG_VIRTUAL);
1805 
1806         if (ksp == NULL)
1807                 return (NULL);
1808 
1809         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1810         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1811         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1812         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1813         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1814         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1815         ksp->ks_update = updatefunc;
1816         ksp->ks_private = zone;
1817         kstat_install(ksp);
1818         return (ksp);
1819 }
1820 
1821 static int
1822 zone_misc_kstat_update(kstat_t *ksp, int rw)
1823 {
1824         zone_t *zone = ksp->ks_private;
1825         zone_misc_kstat_t *zmp = ksp->ks_data;
1826         hrtime_t tmp;
1827 
1828         if (rw == KSTAT_WRITE)
1829                 return (EACCES);
1830 
1831         tmp = zone->zone_utime;
1832         scalehrtime(&tmp);
1833         zmp->zm_utime.value.ui64 = tmp;
1834         tmp = zone->zone_stime;
1835         scalehrtime(&tmp);
1836         zmp->zm_stime.value.ui64 = tmp;
1837         tmp = zone->zone_wtime;
1838         scalehrtime(&tmp);
1839         zmp->zm_wtime.value.ui64 = tmp;
1840 
1841         zmp->zm_boot_hrtime.value.t = zone->zone_boot_hrtime;
1842 
1843         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1844         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1845         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1846 
1847         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1848         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1849         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1850         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1851 
1852         return (0);
1853 }
1854 
1855 static kstat_t *
1856 zone_misc_kstat_create(zone_t *zone)
1857 {
1858         kstat_t *ksp;
1859         zone_misc_kstat_t *zmp;
1860 
1861         if ((ksp = kstat_create_zone("zones", zone->zone_id,
1862             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1863             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1864             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1865                 return (NULL);
1866 
1867         if (zone->zone_id != GLOBAL_ZONEID)
1868                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1869 
1870         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1871         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1872         ksp->ks_lock = &zone->zone_misc_lock;
1873         zone->zone_misc_stats = zmp;
1874 
1875         /* The kstat "name" field is not large enough for a full zonename */
1876         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1877         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1878         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1879         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1880         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1881         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1882         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1883         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1884             KSTAT_DATA_UINT32);
1885         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1886         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1887             KSTAT_DATA_UINT32);
1888         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1889         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1890         kstat_named_init(&zmp->zm_boot_hrtime, "boot_hrtime", KSTAT_DATA_TIME);
1891 
1892 
1893         ksp->ks_update = zone_misc_kstat_update;
1894         ksp->ks_private = zone;
1895 
1896         kstat_install(ksp);
1897         return (ksp);
1898 }
1899 
1900 static void
1901 zone_kstat_create(zone_t *zone)
1902 {
1903         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1904             "lockedmem", zone_lockedmem_kstat_update);
1905         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1906             "swapresv", zone_swapresv_kstat_update);
1907         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
1908             "nprocs", zone_nprocs_kstat_update);
1909 
1910         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
1911                 zone->zone_misc_stats = kmem_zalloc(
1912                     sizeof (zone_misc_kstat_t), KM_SLEEP);
1913         }
1914 }
1915 
1916 static void
1917 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
1918 {
1919         void *data;
1920 
1921         if (*pkstat != NULL) {
1922                 data = (*pkstat)->ks_data;
1923                 kstat_delete(*pkstat);
1924                 kmem_free(data, datasz);
1925                 *pkstat = NULL;
1926         }
1927 }
1928 
1929 static void
1930 zone_kstat_delete(zone_t *zone)
1931 {
1932         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
1933             sizeof (zone_kstat_t));
1934         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
1935             sizeof (zone_kstat_t));
1936         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
1937             sizeof (zone_kstat_t));
1938         zone_kstat_delete_common(&zone->zone_misc_ksp,
1939             sizeof (zone_misc_kstat_t));
1940 }
1941 
1942 /*
1943  * Called very early on in boot to initialize the ZSD list so that
1944  * zone_key_create() can be called before zone_init().  It also initializes
1945  * portions of zone0 which may be used before zone_init() is called.  The
1946  * variable "global_zone" will be set when zone0 is fully initialized by
1947  * zone_init().
1948  */
1949 void
1950 zone_zsd_init(void)
1951 {
1952         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
1953         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
1954         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
1955             offsetof(struct zsd_entry, zsd_linkage));
1956         list_create(&zone_active, sizeof (zone_t),
1957             offsetof(zone_t, zone_linkage));
1958         list_create(&zone_deathrow, sizeof (zone_t),
1959             offsetof(zone_t, zone_linkage));
1960 
1961         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
1962         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
1963         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
1964         zone0.zone_shares = 1;
1965         zone0.zone_nlwps = 0;
1966         zone0.zone_nlwps_ctl = INT_MAX;
1967         zone0.zone_nprocs = 0;
1968         zone0.zone_nprocs_ctl = INT_MAX;
1969         zone0.zone_locked_mem = 0;
1970         zone0.zone_locked_mem_ctl = UINT64_MAX;
1971         ASSERT(zone0.zone_max_swap == 0);
1972         zone0.zone_max_swap_ctl = UINT64_MAX;
1973         zone0.zone_max_lofi = 0;
1974         zone0.zone_max_lofi_ctl = UINT64_MAX;
1975         zone0.zone_shmmax = 0;
1976         zone0.zone_ipc.ipcq_shmmni = 0;
1977         zone0.zone_ipc.ipcq_semmni = 0;
1978         zone0.zone_ipc.ipcq_msgmni = 0;
1979         zone0.zone_name = GLOBAL_ZONENAME;
1980         zone0.zone_nodename = utsname.nodename;
1981         zone0.zone_domain = srpc_domain;
1982         zone0.zone_hostid = HW_INVALID_HOSTID;
1983         zone0.zone_fs_allowed = NULL;
1984         zone0.zone_ref = 1;
1985         zone0.zone_id = GLOBAL_ZONEID;
1986         zone0.zone_status = ZONE_IS_RUNNING;
1987         zone0.zone_rootpath = "/";
1988         zone0.zone_rootpathlen = 2;
1989         zone0.zone_psetid = ZONE_PS_INVAL;
1990         zone0.zone_ncpus = 0;
1991         zone0.zone_ncpus_online = 0;
1992         zone0.zone_proc_initpid = 1;
1993         zone0.zone_initname = initname;
1994         zone0.zone_lockedmem_kstat = NULL;
1995         zone0.zone_swapresv_kstat = NULL;
1996         zone0.zone_nprocs_kstat = NULL;
1997 
1998         zone0.zone_stime = 0;
1999         zone0.zone_utime = 0;
2000         zone0.zone_wtime = 0;
2001 
2002         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2003             offsetof(zone_ref_t, zref_linkage));
2004         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2005             offsetof(struct zsd_entry, zsd_linkage));
2006         list_insert_head(&zone_active, &zone0);
2007 
2008         /*
2009          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2010          * to anything meaningful.  It is assigned to be 'rootdir' in
2011          * vfs_mountroot().
2012          */
2013         zone0.zone_rootvp = NULL;
2014         zone0.zone_vfslist = NULL;
2015         zone0.zone_bootargs = initargs;
2016         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2017         /*
2018          * The global zone has all privileges
2019          */
2020         priv_fillset(zone0.zone_privset);
2021         /*
2022          * Add p0 to the global zone
2023          */
2024         zone0.zone_zsched = &p0;
2025         p0.p_zone = &zone0;
2026 }
2027 
2028 /*
2029  * Compute a hash value based on the contents of the label and the DOI.  The
2030  * hash algorithm is somewhat arbitrary, but is based on the observation that
2031  * humans will likely pick labels that differ by amounts that work out to be
2032  * multiples of the number of hash chains, and thus stirring in some primes
2033  * should help.
2034  */
2035 static uint_t
2036 hash_bylabel(void *hdata, mod_hash_key_t key)
2037 {
2038         const ts_label_t *lab = (ts_label_t *)key;
2039         const uint32_t *up, *ue;
2040         uint_t hash;
2041         int i;
2042 
2043         _NOTE(ARGUNUSED(hdata));
2044 
2045         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2046         /* we depend on alignment of label, but not representation */
2047         up = (const uint32_t *)&lab->tsl_label;
2048         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2049         i = 1;
2050         while (up < ue) {
2051                 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2052                 hash += *up + (*up << ((i % 16) + 1));
2053                 up++;
2054                 i++;
2055         }
2056         return (hash);
2057 }
2058 
2059 /*
2060  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2061  * equal).  This may need to be changed if less than / greater than is ever
2062  * needed.
2063  */
2064 static int
2065 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2066 {
2067         ts_label_t *lab1 = (ts_label_t *)key1;
2068         ts_label_t *lab2 = (ts_label_t *)key2;
2069 
2070         return (label_equal(lab1, lab2) ? 0 : 1);
2071 }
2072 
2073 /*
2074  * Called by main() to initialize the zones framework.
2075  */
2076 void
2077 zone_init(void)
2078 {
2079         rctl_dict_entry_t *rde;
2080         rctl_val_t *dval;
2081         rctl_set_t *set;
2082         rctl_alloc_gp_t *gp;
2083         rctl_entity_p_t e;
2084         int res;
2085 
2086         ASSERT(curproc == &p0);
2087 
2088         /*
2089          * Create ID space for zone IDs.  ID 0 is reserved for the
2090          * global zone.
2091          */
2092         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2093 
2094         /*
2095          * Initialize generic zone resource controls, if any.
2096          */
2097         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2098             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2099             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2100             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2101 
2102         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2103             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2104             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2105             RCTL_GLOBAL_INFINITE,
2106             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2107 
2108         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2109             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2110             INT_MAX, INT_MAX, &zone_lwps_ops);
2111 
2112         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2113             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2114             INT_MAX, INT_MAX, &zone_procs_ops);
2115 
2116         /*
2117          * System V IPC resource controls
2118          */
2119         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2120             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2121             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2122 
2123         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2124             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2125             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2126 
2127         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2128             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2129             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2130 
2131         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2132             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2133             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2134 
2135         /*
2136          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2137          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2138          */
2139         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2140         bzero(dval, sizeof (rctl_val_t));
2141         dval->rcv_value = 1;
2142         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2143         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2144         dval->rcv_action_recip_pid = -1;
2145 
2146         rde = rctl_dict_lookup("zone.cpu-shares");
2147         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2148 
2149         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2150             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2151             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2152             &zone_locked_mem_ops);
2153 
2154         rc_zone_max_swap = rctl_register("zone.max-swap",
2155             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2156             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2157             &zone_max_swap_ops);
2158 
2159         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2160             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2161             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2162             &zone_max_lofi_ops);
2163 
2164         /*
2165          * Initialize the ``global zone''.
2166          */
2167         set = rctl_set_create();
2168         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2169         mutex_enter(&p0.p_lock);
2170         e.rcep_p.zone = &zone0;
2171         e.rcep_t = RCENTITY_ZONE;
2172         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2173             gp);
2174 
2175         zone0.zone_nlwps = p0.p_lwpcnt;
2176         zone0.zone_nprocs = 1;
2177         zone0.zone_ntasks = 1;
2178         mutex_exit(&p0.p_lock);
2179         zone0.zone_restart_init = B_TRUE;
2180         zone0.zone_brand = &native_brand;
2181         rctl_prealloc_destroy(gp);
2182         /*
2183          * pool_default hasn't been initialized yet, so we let pool_init()
2184          * take care of making sure the global zone is in the default pool.
2185          */
2186 
2187         /*
2188          * Initialize global zone kstats
2189          */
2190         zone_kstat_create(&zone0);
2191 
2192         /*
2193          * Initialize zone label.
2194          * mlp are initialized when tnzonecfg is loaded.
2195          */
2196         zone0.zone_slabel = l_admin_low;
2197         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2198         label_hold(l_admin_low);
2199 
2200         /*
2201          * Initialise the lock for the database structure used by mntfs.
2202          */
2203         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2204 
2205         mutex_enter(&zonehash_lock);
2206         zone_uniqid(&zone0);
2207         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2208 
2209         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2210             mod_hash_null_valdtor);
2211         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2212             zone_hash_size, mod_hash_null_valdtor);
2213         /*
2214          * maintain zonehashbylabel only for labeled systems
2215          */
2216         if (is_system_labeled())
2217                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2218                     zone_hash_size, mod_hash_null_keydtor,
2219                     mod_hash_null_valdtor, hash_bylabel, NULL,
2220                     hash_labelkey_cmp, KM_SLEEP);
2221         zonecount = 1;
2222 
2223         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2224             (mod_hash_val_t)&zone0);
2225         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2226             (mod_hash_val_t)&zone0);
2227         if (is_system_labeled()) {
2228                 zone0.zone_flags |= ZF_HASHED_LABEL;
2229                 (void) mod_hash_insert(zonehashbylabel,
2230                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2231         }
2232         mutex_exit(&zonehash_lock);
2233 
2234         /*
2235          * We avoid setting zone_kcred until now, since kcred is initialized
2236          * sometime after zone_zsd_init() and before zone_init().
2237          */
2238         zone0.zone_kcred = kcred;
2239         /*
2240          * The global zone is fully initialized (except for zone_rootvp which
2241          * will be set when the root filesystem is mounted).
2242          */
2243         global_zone = &zone0;
2244 
2245         /*
2246          * Setup an event channel to send zone status change notifications on
2247          */
2248         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2249             EVCH_CREAT);
2250 
2251         if (res)
2252                 panic("Sysevent_evc_bind failed during zone setup.\n");
2253 
2254 }
2255 
2256 static void
2257 zone_free(zone_t *zone)
2258 {
2259         ASSERT(zone != global_zone);
2260         ASSERT(zone->zone_ntasks == 0);
2261         ASSERT(zone->zone_nlwps == 0);
2262         ASSERT(zone->zone_nprocs == 0);
2263         ASSERT(zone->zone_cred_ref == 0);
2264         ASSERT(zone->zone_kcred == NULL);
2265         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2266             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2267         ASSERT(list_is_empty(&zone->zone_ref_list));
2268 
2269         /*
2270          * Remove any zone caps.
2271          */
2272         cpucaps_zone_remove(zone);
2273 
2274         ASSERT(zone->zone_cpucap == NULL);
2275 
2276         /* remove from deathrow list */
2277         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2278                 ASSERT(zone->zone_ref == 0);
2279                 mutex_enter(&zone_deathrow_lock);
2280                 list_remove(&zone_deathrow, zone);
2281                 mutex_exit(&zone_deathrow_lock);
2282         }
2283 
2284         list_destroy(&zone->zone_ref_list);
2285         zone_free_zsd(zone);
2286         zone_free_datasets(zone);
2287         list_destroy(&zone->zone_dl_list);
2288 
2289         if (zone->zone_rootvp != NULL)
2290                 VN_RELE(zone->zone_rootvp);
2291         if (zone->zone_rootpath)
2292                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2293         if (zone->zone_name != NULL)
2294                 kmem_free(zone->zone_name, ZONENAME_MAX);
2295         if (zone->zone_slabel != NULL)
2296                 label_rele(zone->zone_slabel);
2297         if (zone->zone_nodename != NULL)
2298                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2299         if (zone->zone_domain != NULL)
2300                 kmem_free(zone->zone_domain, _SYS_NMLN);
2301         if (zone->zone_privset != NULL)
2302                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2303         if (zone->zone_rctls != NULL)
2304                 rctl_set_free(zone->zone_rctls);
2305         if (zone->zone_bootargs != NULL)
2306                 strfree(zone->zone_bootargs);
2307         if (zone->zone_initname != NULL)
2308                 strfree(zone->zone_initname);
2309         if (zone->zone_fs_allowed != NULL)
2310                 strfree(zone->zone_fs_allowed);
2311         if (zone->zone_pfexecd != NULL)
2312                 klpd_freelist(&zone->zone_pfexecd);
2313         id_free(zoneid_space, zone->zone_id);
2314         mutex_destroy(&zone->zone_lock);
2315         cv_destroy(&zone->zone_cv);
2316         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2317         rw_destroy(&zone->zone_mntfs_db_lock);
2318         kmem_free(zone, sizeof (zone_t));
2319 }
2320 
2321 /*
2322  * See block comment at the top of this file for information about zone
2323  * status values.
2324  */
2325 /*
2326  * Convenience function for setting zone status.
2327  */
2328 static void
2329 zone_status_set(zone_t *zone, zone_status_t status)
2330 {
2331 
2332         nvlist_t *nvl = NULL;
2333         ASSERT(MUTEX_HELD(&zone_status_lock));
2334         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2335             status >= zone_status_get(zone));
2336 
2337         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2338             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2339             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2340             zone_status_table[status]) ||
2341             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2342             zone_status_table[zone->zone_status]) ||
2343             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2344             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2345             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2346             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2347 #ifdef DEBUG
2348                 (void) printf(
2349                     "Failed to allocate and send zone state change event.\n");
2350 #endif
2351         }
2352         nvlist_free(nvl);
2353 
2354         zone->zone_status = status;
2355 
2356         cv_broadcast(&zone->zone_cv);
2357 }
2358 
2359 /*
2360  * Public function to retrieve the zone status.  The zone status may
2361  * change after it is retrieved.
2362  */
2363 zone_status_t
2364 zone_status_get(zone_t *zone)
2365 {
2366         return (zone->zone_status);
2367 }
2368 
2369 static int
2370 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2371 {
2372         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2373         int err = 0;
2374 
2375         ASSERT(zone != global_zone);
2376         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2377                 goto done;      /* EFAULT or ENAMETOOLONG */
2378 
2379         if (zone->zone_bootargs != NULL)
2380                 strfree(zone->zone_bootargs);
2381 
2382         zone->zone_bootargs = strdup(buf);
2383 
2384 done:
2385         kmem_free(buf, BOOTARGS_MAX);
2386         return (err);
2387 }
2388 
2389 static int
2390 zone_set_brand(zone_t *zone, const char *brand)
2391 {
2392         struct brand_attr *attrp;
2393         brand_t *bp;
2394 
2395         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2396         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2397                 kmem_free(attrp, sizeof (struct brand_attr));
2398                 return (EFAULT);
2399         }
2400 
2401         bp = brand_register_zone(attrp);
2402         kmem_free(attrp, sizeof (struct brand_attr));
2403         if (bp == NULL)
2404                 return (EINVAL);
2405 
2406         /*
2407          * This is the only place where a zone can change it's brand.
2408          * We already need to hold zone_status_lock to check the zone
2409          * status, so we'll just use that lock to serialize zone
2410          * branding requests as well.
2411          */
2412         mutex_enter(&zone_status_lock);
2413 
2414         /* Re-Branding is not allowed and the zone can't be booted yet */
2415         if ((ZONE_IS_BRANDED(zone)) ||
2416             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2417                 mutex_exit(&zone_status_lock);
2418                 brand_unregister_zone(bp);
2419                 return (EINVAL);
2420         }
2421 
2422         /* set up the brand specific data */
2423         zone->zone_brand = bp;
2424         ZBROP(zone)->b_init_brand_data(zone);
2425 
2426         mutex_exit(&zone_status_lock);
2427         return (0);
2428 }
2429 
2430 static int
2431 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2432 {
2433         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2434         int err = 0;
2435 
2436         ASSERT(zone != global_zone);
2437         if ((err = copyinstr(zone_fs_allowed, buf,
2438             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2439                 goto done;
2440 
2441         if (zone->zone_fs_allowed != NULL)
2442                 strfree(zone->zone_fs_allowed);
2443 
2444         zone->zone_fs_allowed = strdup(buf);
2445 
2446 done:
2447         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2448         return (err);
2449 }
2450 
2451 static int
2452 zone_set_initname(zone_t *zone, const char *zone_initname)
2453 {
2454         char initname[INITNAME_SZ];
2455         size_t len;
2456         int err = 0;
2457 
2458         ASSERT(zone != global_zone);
2459         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2460                 return (err);   /* EFAULT or ENAMETOOLONG */
2461 
2462         if (zone->zone_initname != NULL)
2463                 strfree(zone->zone_initname);
2464 
2465         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2466         (void) strcpy(zone->zone_initname, initname);
2467         return (0);
2468 }
2469 
2470 static int
2471 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2472 {
2473         uint64_t mcap;
2474         int err = 0;
2475 
2476         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2477                 zone->zone_phys_mcap = mcap;
2478 
2479         return (err);
2480 }
2481 
2482 static int
2483 zone_set_sched_class(zone_t *zone, const char *new_class)
2484 {
2485         char sched_class[PC_CLNMSZ];
2486         id_t classid;
2487         int err;
2488 
2489         ASSERT(zone != global_zone);
2490         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2491                 return (err);   /* EFAULT or ENAMETOOLONG */
2492 
2493         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2494                 return (set_errno(EINVAL));
2495         zone->zone_defaultcid = classid;
2496         ASSERT(zone->zone_defaultcid > 0 &&
2497             zone->zone_defaultcid < loaded_classes);
2498 
2499         return (0);
2500 }
2501 
2502 /*
2503  * Block indefinitely waiting for (zone_status >= status)
2504  */
2505 void
2506 zone_status_wait(zone_t *zone, zone_status_t status)
2507 {
2508         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2509 
2510         mutex_enter(&zone_status_lock);
2511         while (zone->zone_status < status) {
2512                 cv_wait(&zone->zone_cv, &zone_status_lock);
2513         }
2514         mutex_exit(&zone_status_lock);
2515 }
2516 
2517 /*
2518  * Private CPR-safe version of zone_status_wait().
2519  */
2520 static void
2521 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2522 {
2523         callb_cpr_t cprinfo;
2524 
2525         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2526 
2527         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2528             str);
2529         mutex_enter(&zone_status_lock);
2530         while (zone->zone_status < status) {
2531                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2532                 cv_wait(&zone->zone_cv, &zone_status_lock);
2533                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2534         }
2535         /*
2536          * zone_status_lock is implicitly released by the following.
2537          */
2538         CALLB_CPR_EXIT(&cprinfo);
2539 }
2540 
2541 /*
2542  * Block until zone enters requested state or signal is received.  Return (0)
2543  * if signaled, non-zero otherwise.
2544  */
2545 int
2546 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2547 {
2548         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2549 
2550         mutex_enter(&zone_status_lock);
2551         while (zone->zone_status < status) {
2552                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2553                         mutex_exit(&zone_status_lock);
2554                         return (0);
2555                 }
2556         }
2557         mutex_exit(&zone_status_lock);
2558         return (1);
2559 }
2560 
2561 /*
2562  * Block until the zone enters the requested state or the timeout expires,
2563  * whichever happens first.  Return (-1) if operation timed out, time remaining
2564  * otherwise.
2565  */
2566 clock_t
2567 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2568 {
2569         clock_t timeleft = 0;
2570 
2571         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2572 
2573         mutex_enter(&zone_status_lock);
2574         while (zone->zone_status < status && timeleft != -1) {
2575                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2576         }
2577         mutex_exit(&zone_status_lock);
2578         return (timeleft);
2579 }
2580 
2581 /*
2582  * Block until the zone enters the requested state, the current process is
2583  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2584  * operation timed out, 0 if signaled, time remaining otherwise.
2585  */
2586 clock_t
2587 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2588 {
2589         clock_t timeleft = tim - ddi_get_lbolt();
2590 
2591         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2592 
2593         mutex_enter(&zone_status_lock);
2594         while (zone->zone_status < status) {
2595                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2596                     tim);
2597                 if (timeleft <= 0)
2598                         break;
2599         }
2600         mutex_exit(&zone_status_lock);
2601         return (timeleft);
2602 }
2603 
/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds).  We wait for zone_ref to drop to
 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
 * later freed when the zone_cred_ref drops to 0, though nothing other
 * than the zone id and privilege set should be accessed once the zone
 * is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0.  This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.
 *
 * Zones also provide a tracked reference counting mechanism in which zone
 * references are represented by "crumbs" (zone_ref structures).  Crumbs help
 * debuggers determine the sources of leaked zone references.  See
 * zone_hold_ref() and zone_rele_ref() below for more information.
 */

/*
 * Debugging flag described above.  It defaults to 0 (do not wait for cred
 * references) and is consulted by ZONE_IS_UNREF() and zone_cred_rele().
 */
int zone_wait_for_cred = 0;
2627 
2628 static void
2629 zone_hold_locked(zone_t *z)
2630 {
2631         ASSERT(MUTEX_HELD(&z->zone_lock));
2632         z->zone_ref++;
2633         ASSERT(z->zone_ref != 0);
2634 }
2635 
2636 /*
2637  * Increment the specified zone's reference count.  The zone's zone_t structure
2638  * will not be freed as long as the zone's reference count is nonzero.
2639  * Decrement the zone's reference count via zone_rele().
2640  *
2641  * NOTE: This function should only be used to hold zones for short periods of
2642  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2643  */
2644 void
2645 zone_hold(zone_t *z)
2646 {
2647         mutex_enter(&z->zone_lock);
2648         zone_hold_locked(z);
2649         mutex_exit(&z->zone_lock);
2650 }
2651 
/*
 * A zone is ready to be destroyed once its non-cred reference count has
 * fallen to 1 and, if zone_wait_for_cred is set, its cred reference count
 * has fallen to 0 as well.
 */
#define ZONE_IS_UNREF(zone)     \
        ((zone)->zone_ref == 1 && \
        (zone_wait_for_cred == 0 || (zone)->zone_cred_ref == 0))
2659 
2660 /*
2661  * Common zone reference release function invoked by zone_rele() and
2662  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2663  * zone's subsystem-specific reference counters are not affected by the
2664  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2665  * removed from the specified zone's reference list.  ref must be non-NULL iff
2666  * subsys is not ZONE_REF_NUM_SUBSYS.
2667  */
2668 static void
2669 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2670 {
2671         boolean_t wakeup;
2672 
2673         mutex_enter(&z->zone_lock);
2674         ASSERT(z->zone_ref != 0);
2675         z->zone_ref--;
2676         if (subsys != ZONE_REF_NUM_SUBSYS) {
2677                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2678                 z->zone_subsys_ref[subsys]--;
2679                 list_remove(&z->zone_ref_list, ref);
2680         }
2681         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2682                 /* no more refs, free the structure */
2683                 mutex_exit(&z->zone_lock);
2684                 zone_free(z);
2685                 return;
2686         }
2687         /* signal zone_destroy so the zone can finish halting */
2688         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2689         mutex_exit(&z->zone_lock);
2690 
2691         if (wakeup) {
2692                 /*
2693                  * Grabbing zonehash_lock here effectively synchronizes with
2694                  * zone_destroy() to avoid missed signals.
2695                  */
2696                 mutex_enter(&zonehash_lock);
2697                 cv_broadcast(&zone_destroy_cv);
2698                 mutex_exit(&zonehash_lock);
2699         }
2700 }
2701 
2702 /*
2703  * Decrement the specified zone's reference count.  The specified zone will
2704  * cease to exist after this function returns if the reference count drops to
2705  * zero.  This function should be paired with zone_hold().
2706  */
2707 void
2708 zone_rele(zone_t *z)
2709 {
2710         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2711 }
2712 
2713 /*
2714  * Initialize a zone reference structure.  This function must be invoked for
2715  * a reference structure before the structure is passed to zone_hold_ref().
2716  */
2717 void
2718 zone_init_ref(zone_ref_t *ref)
2719 {
2720         ref->zref_zone = NULL;
2721         list_link_init(&ref->zref_linkage);
2722 }
2723 
2724 /*
2725  * Acquire a reference to zone z.  The caller must specify the
2726  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2727  * zone_ref_t structure will represent a reference to the specified zone.  Use
2728  * zone_rele_ref() to release the reference.
2729  *
2730  * The referenced zone_t structure will not be freed as long as the zone_t's
2731  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2732  * references.
2733  *
2734  * NOTE: The zone_ref_t structure must be initialized before it is used.
2735  * See zone_init_ref() above.
2736  */
2737 void
2738 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2739 {
2740         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2741 
2742         /*
2743          * Prevent consumers from reusing a reference structure before
2744          * releasing it.
2745          */
2746         VERIFY(ref->zref_zone == NULL);
2747 
2748         ref->zref_zone = z;
2749         mutex_enter(&z->zone_lock);
2750         zone_hold_locked(z);
2751         z->zone_subsys_ref[subsys]++;
2752         ASSERT(z->zone_subsys_ref[subsys] != 0);
2753         list_insert_head(&z->zone_ref_list, ref);
2754         mutex_exit(&z->zone_lock);
2755 }
2756 
2757 /*
2758  * Release the zone reference represented by the specified zone_ref_t.
2759  * The reference is invalid after it's released; however, the zone_ref_t
2760  * structure can be reused without having to invoke zone_init_ref().
2761  * subsys should be the same value that was passed to zone_hold_ref()
2762  * when the reference was acquired.
2763  */
2764 void
2765 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2766 {
2767         zone_rele_common(ref->zref_zone, ref, subsys);
2768 
2769         /*
2770          * Set the zone_ref_t's zref_zone field to NULL to generate panics
2771          * when consumers dereference the reference.  This helps us catch
2772          * consumers who use released references.  Furthermore, this lets
2773          * consumers reuse the zone_ref_t structure without having to
2774          * invoke zone_init_ref().
2775          */
2776         ref->zref_zone = NULL;
2777 }
2778 
2779 void
2780 zone_cred_hold(zone_t *z)
2781 {
2782         mutex_enter(&z->zone_lock);
2783         z->zone_cred_ref++;
2784         ASSERT(z->zone_cred_ref != 0);
2785         mutex_exit(&z->zone_lock);
2786 }
2787 
2788 void
2789 zone_cred_rele(zone_t *z)
2790 {
2791         boolean_t wakeup;
2792 
2793         mutex_enter(&z->zone_lock);
2794         ASSERT(z->zone_cred_ref != 0);
2795         z->zone_cred_ref--;
2796         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2797                 /* no more refs, free the structure */
2798                 mutex_exit(&z->zone_lock);
2799                 zone_free(z);
2800                 return;
2801         }
2802         /*
2803          * If zone_destroy is waiting for the cred references to drain
2804          * out, and they have, signal it.
2805          */
2806         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2807             zone_status_get(z) >= ZONE_IS_DEAD);
2808         mutex_exit(&z->zone_lock);
2809 
2810         if (wakeup) {
2811                 /*
2812                  * Grabbing zonehash_lock here effectively synchronizes with
2813                  * zone_destroy() to avoid missed signals.
2814                  */
2815                 mutex_enter(&zonehash_lock);
2816                 cv_broadcast(&zone_destroy_cv);
2817                 mutex_exit(&zonehash_lock);
2818         }
2819 }
2820 
2821 void
2822 zone_task_hold(zone_t *z)
2823 {
2824         mutex_enter(&z->zone_lock);
2825         z->zone_ntasks++;
2826         ASSERT(z->zone_ntasks != 0);
2827         mutex_exit(&z->zone_lock);
2828 }
2829 
/*
 * Decrement the zone's task count (zone_ntasks) and drive the zone's
 * status forward when the count becomes small:
 *
 *   - more than one task left: nothing else to do;
 *   - exactly one task left: if the zone is ZONE_IS_SHUTTING_DOWN and the
 *     count is still unchanged, the zone becomes ZONE_IS_EMPTY;
 *   - no tasks left: zone_zsched is cleared and the zone becomes
 *     ZONE_IS_DEAD.
 *
 * For the last two cases a temporary zone hold is taken so the zone_t can
 * be used after zone_lock is dropped; it is released before returning.
 */
void
zone_task_rele(zone_t *zone)
{
        uint_t refcnt;

        mutex_enter(&zone->zone_lock);
        ASSERT(zone->zone_ntasks != 0);
        refcnt = --zone->zone_ntasks;
        if (refcnt > 1)      {       /* Common case */
                mutex_exit(&zone->zone_lock);
                return;
        }
        zone_hold_locked(zone); /* so we can use the zone_t later */
        mutex_exit(&zone->zone_lock);
        if (refcnt == 1) {
                /*
                 * See if the zone is shutting down.
                 */
                mutex_enter(&zone_status_lock);
                if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
                        goto out;
                }

                /*
                 * Make sure the ntasks didn't change since we
                 * dropped zone_lock.
                 */
                mutex_enter(&zone->zone_lock);
                if (refcnt != zone->zone_ntasks) {
                        mutex_exit(&zone->zone_lock);
                        goto out;
                }
                mutex_exit(&zone->zone_lock);

                /*
                 * No more user processes in the zone.  The zone is empty.
                 */
                zone_status_set(zone, ZONE_IS_EMPTY);
                goto out;
        }

        ASSERT(refcnt == 0);
        /*
         * zsched has exited; the zone is dead.
         */
        zone->zone_zsched = NULL;            /* paranoia */
        mutex_enter(&zone_status_lock);
        zone_status_set(zone, ZONE_IS_DEAD);
out:
        /*
         * Every path that reaches this label holds zone_status_lock.
         * Release it, then drop the temporary hold taken above.
         */
        mutex_exit(&zone_status_lock);
        zone_rele(zone);
}
2882 
2883 zoneid_t
2884 getzoneid(void)
2885 {
2886         return (curproc->p_zone->zone_id);
2887 }
2888 
2889 /*
2890  * Internal versions of zone_find_by_*().  These don't zone_hold() or
2891  * check the validity of a zone's state.
2892  */
2893 static zone_t *
2894 zone_find_all_by_id(zoneid_t zoneid)
2895 {
2896         mod_hash_val_t hv;
2897         zone_t *zone = NULL;
2898 
2899         ASSERT(MUTEX_HELD(&zonehash_lock));
2900 
2901         if (mod_hash_find(zonehashbyid,
2902             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2903                 zone = (zone_t *)hv;
2904         return (zone);
2905 }
2906 
2907 static zone_t *
2908 zone_find_all_by_label(const ts_label_t *label)
2909 {
2910         mod_hash_val_t hv;
2911         zone_t *zone = NULL;
2912 
2913         ASSERT(MUTEX_HELD(&zonehash_lock));
2914 
2915         /*
2916          * zonehashbylabel is not maintained for unlabeled systems
2917          */
2918         if (!is_system_labeled())
2919                 return (NULL);
2920         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2921                 zone = (zone_t *)hv;
2922         return (zone);
2923 }
2924 
2925 static zone_t *
2926 zone_find_all_by_name(char *name)
2927 {
2928         mod_hash_val_t hv;
2929         zone_t *zone = NULL;
2930 
2931         ASSERT(MUTEX_HELD(&zonehash_lock));
2932 
2933         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2934                 zone = (zone_t *)hv;
2935         return (zone);
2936 }
2937 
2938 /*
2939  * Public interface for looking up a zone by zoneid.  Only returns the zone if
2940  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2941  * Caller must call zone_rele() once it is done with the zone.
2942  *
2943  * The zone may begin the zone_destroy() sequence immediately after this
2944  * function returns, but may be safely used until zone_rele() is called.
2945  */
2946 zone_t *
2947 zone_find_by_id(zoneid_t zoneid)
2948 {
2949         zone_t *zone;
2950         zone_status_t status;
2951 
2952         mutex_enter(&zonehash_lock);
2953         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2954                 mutex_exit(&zonehash_lock);
2955                 return (NULL);
2956         }
2957         status = zone_status_get(zone);
2958         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2959                 /*
2960                  * For all practical purposes the zone doesn't exist.
2961                  */
2962                 mutex_exit(&zonehash_lock);
2963                 return (NULL);
2964         }
2965         zone_hold(zone);
2966         mutex_exit(&zonehash_lock);
2967         return (zone);
2968 }
2969 
2970 /*
2971  * Similar to zone_find_by_id, but using zone label as the key.
2972  */
2973 zone_t *
2974 zone_find_by_label(const ts_label_t *label)
2975 {
2976         zone_t *zone;
2977         zone_status_t status;
2978 
2979         mutex_enter(&zonehash_lock);
2980         if ((zone = zone_find_all_by_label(label)) == NULL) {
2981                 mutex_exit(&zonehash_lock);
2982                 return (NULL);
2983         }
2984 
2985         status = zone_status_get(zone);
2986         if (status > ZONE_IS_DOWN) {
2987                 /*
2988                  * For all practical purposes the zone doesn't exist.
2989                  */
2990                 mutex_exit(&zonehash_lock);
2991                 return (NULL);
2992         }
2993         zone_hold(zone);
2994         mutex_exit(&zonehash_lock);
2995         return (zone);
2996 }
2997 
2998 /*
2999  * Similar to zone_find_by_id, but using zone name as the key.
3000  */
3001 zone_t *
3002 zone_find_by_name(char *name)
3003 {
3004         zone_t *zone;
3005         zone_status_t status;
3006 
3007         mutex_enter(&zonehash_lock);
3008         if ((zone = zone_find_all_by_name(name)) == NULL) {
3009                 mutex_exit(&zonehash_lock);
3010                 return (NULL);
3011         }
3012         status = zone_status_get(zone);
3013         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3014                 /*
3015                  * For all practical purposes the zone doesn't exist.
3016                  */
3017                 mutex_exit(&zonehash_lock);
3018                 return (NULL);
3019         }
3020         zone_hold(zone);
3021         mutex_exit(&zonehash_lock);
3022         return (zone);
3023 }
3024 
3025 /*
3026  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3027  * if there is a zone "foo" rooted at /foo/root, and the path argument
3028  * is "/foo/root/proc", it will return the held zone_t corresponding to
3029  * zone "foo".
3030  *
3031  * zone_find_by_path() always returns a non-NULL value, since at the
3032  * very least every path will be contained in the global zone.
3033  *
3034  * As with the other zone_find_by_*() functions, the caller is
3035  * responsible for zone_rele()ing the return value of this function.
3036  */
3037 zone_t *
3038 zone_find_by_path(const char *path)
3039 {
3040         zone_t *zone;
3041         zone_t *zret = NULL;
3042         zone_status_t status;
3043 
3044         if (path == NULL) {
3045                 /*
3046                  * Call from rootconf().
3047                  */
3048                 zone_hold(global_zone);
3049                 return (global_zone);
3050         }
3051         ASSERT(*path == '/');
3052         mutex_enter(&zonehash_lock);
3053         for (zone = list_head(&zone_active); zone != NULL;
3054             zone = list_next(&zone_active, zone)) {
3055                 if (ZONE_PATH_VISIBLE(path, zone))
3056                         zret = zone;
3057         }
3058         ASSERT(zret != NULL);
3059         status = zone_status_get(zret);
3060         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3061                 /*
3062                  * Zone practically doesn't exist.
3063                  */
3064                 zret = global_zone;
3065         }
3066         zone_hold(zret);
3067         mutex_exit(&zonehash_lock);
3068         return (zret);
3069 }
3070 
/*
 * Public interface for updating per-zone load averages.  Called once per
 * second.
 *
 * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
 *
 * Walks zone_active under zonehash_lock and updates each eligible zone
 * while holding that zone's zone_lock.  Zones whose status is below
 * ZONE_IS_READY or at/after ZONE_IS_DOWN are skipped.
 */
void
zone_loadavg_update()
{
        zone_t *zp;
        zone_status_t status;
        struct loadavg_s *lavg;
        hrtime_t zone_total;
        int i;
        hrtime_t hr_avg;
        int nrun;
        /* Coefficients, one for each of the three averages computed below. */
        static int64_t f[3] = { 135, 27, 9 };
        int64_t q, r;

        mutex_enter(&zonehash_lock);
        for (zp = list_head(&zone_active); zp != NULL;
            zp = list_next(&zone_active, zp)) {
                mutex_enter(&zp->zone_lock);

                /* Skip zones that are on the way down or not yet up */
                status = zone_status_get(zp);
                if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
                        /* For all practical purposes the zone doesn't exist. */
                        mutex_exit(&zp->zone_lock);
                        continue;
                }

                /*
                 * Update the 10 second moving average data in zone_loadavg.
                 */
                lavg = &zp->zone_loadavg;

                /* Total of the zone's user, system and wait time so far. */
                zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
                scalehrtime(&zone_total);

                /* The zone_total should always be increasing. */
                /* Record this interval's delta, or 0 if the total went back. */
                lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
                    zone_total - lavg->lg_total : 0;
                /* lg_loads[] is used as a ring of S_LOADAVG_SZ samples. */
                lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
                /* lg_total holds the prev. 1 sec. total */
                lavg->lg_total = zone_total;

                /*
                 * To simplify the calculation, we don't calculate the load avg.
                 * until the zone has been up for at least 10 seconds and our
                 * moving average is thus full.
                 */
                if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
                        lavg->lg_len++;
                        mutex_exit(&zp->zone_lock);
                        continue;
                }

                /* Now calculate the 1min, 5min, 15 min load avg. */
                /* First average the samples currently held in the ring. */
                hr_avg = 0;
                for (i = 0; i < S_LOADAVG_SZ; i++)
                        hr_avg += lavg->lg_loads[i];
                hr_avg = hr_avg / S_LOADAVG_SZ;
                nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);

                /* Compute load avg. See comment in calcloadavg() */
                for (i = 0; i < 3; i++) {
                        /*
                         * q takes the part of zone_hp_avenrun[i] above the
                         * low 16 bits and r takes the low 16 bits; both are
                         * then shifted left by 7.
                         */
                        q = (zp->zone_hp_avenrun[i] >> 16) << 7;
                        r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
                        zp->zone_hp_avenrun[i] +=
                            ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;

                        /* avenrun[] can only hold 31 bits of load avg. */
                        /* Values too large to represent saturate at 0x7fffffff. */
                        if (zp->zone_hp_avenrun[i] <
                            ((uint64_t)1<<(31+16-FSHIFT)))
                                zp->zone_avenrun[i] = (int32_t)
                                    (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
                        else
                                zp->zone_avenrun[i] = 0x7fffffff;
                }

                mutex_exit(&zp->zone_lock);
        }
        mutex_exit(&zonehash_lock);
}
3156 
3157 /*
3158  * Get the number of cpus visible to this zone.  The system-wide global
3159  * 'ncpus' is returned if pools are disabled, the caller is in the
3160  * global zone, or a NULL zone argument is passed in.
3161  */
3162 int
3163 zone_ncpus_get(zone_t *zone)
3164 {
3165         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3166 
3167         return (myncpus != 0 ? myncpus : ncpus);
3168 }
3169 
3170 /*
3171  * Get the number of online cpus visible to this zone.  The system-wide
3172  * global 'ncpus_online' is returned if pools are disabled, the caller
3173  * is in the global zone, or a NULL zone argument is passed in.
3174  */
3175 int
3176 zone_ncpus_online_get(zone_t *zone)
3177 {
3178         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3179 
3180         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3181 }
3182 
3183 /*
3184  * Return the pool to which the zone is currently bound.
3185  */
3186 pool_t *
3187 zone_pool_get(zone_t *zone)
3188 {
3189         ASSERT(pool_lock_held());
3190 
3191         return (zone->zone_pool);
3192 }
3193 
3194 /*
3195  * Set the zone's pool pointer and update the zone's visibility to match
3196  * the resources in the new pool.
3197  */
3198 void
3199 zone_pool_set(zone_t *zone, pool_t *pool)
3200 {
3201         ASSERT(pool_lock_held());
3202         ASSERT(MUTEX_HELD(&cpu_lock));
3203 
3204         zone->zone_pool = pool;
3205         zone_pset_set(zone, pool->pool_pset->pset_id);
3206 }
3207 
3208 /*
3209  * Return the cached value of the id of the processor set to which the
3210  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3211  * facility is disabled.
3212  */
3213 psetid_t
3214 zone_pset_get(zone_t *zone)
3215 {
3216         ASSERT(MUTEX_HELD(&cpu_lock));
3217 
3218         return (zone->zone_psetid);
3219 }
3220 
3221 /*
3222  * Set the cached value of the id of the processor set to which the zone
3223  * is currently bound.  Also update the zone's visibility to match the
3224  * resources in the new processor set.
3225  */
3226 void
3227 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3228 {
3229         psetid_t oldpsetid;
3230 
3231         ASSERT(MUTEX_HELD(&cpu_lock));
3232         oldpsetid = zone_pset_get(zone);
3233 
3234         if (oldpsetid == newpsetid)
3235                 return;
3236         /*
3237          * Global zone sees all.
3238          */
3239         if (zone != global_zone) {
3240                 zone->zone_psetid = newpsetid;
3241                 if (newpsetid != ZONE_PS_INVAL)
3242                         pool_pset_visibility_add(newpsetid, zone);
3243                 if (oldpsetid != ZONE_PS_INVAL)
3244                         pool_pset_visibility_remove(oldpsetid, zone);
3245         }
3246         /*
3247          * Disabling pools, so we should start using the global values
3248          * for ncpus and ncpus_online.
3249          */
3250         if (newpsetid == ZONE_PS_INVAL) {
3251                 zone->zone_ncpus = 0;
3252                 zone->zone_ncpus_online = 0;
3253         }
3254 }
3255 
3256 /*
3257  * Walk the list of active zones and issue the provided callback for
3258  * each of them.
3259  *
3260  * Caller must not be holding any locks that may be acquired under
3261  * zonehash_lock.  See comment at the beginning of the file for a list of
3262  * common locks and their interactions with zones.
3263  */
3264 int
3265 zone_walk(int (*cb)(zone_t *, void *), void *data)
3266 {
3267         zone_t *zone;
3268         int ret = 0;
3269         zone_status_t status;
3270 
3271         mutex_enter(&zonehash_lock);
3272         for (zone = list_head(&zone_active); zone != NULL;
3273             zone = list_next(&zone_active, zone)) {
3274                 /*
3275                  * Skip zones that shouldn't be externally visible.
3276                  */
3277                 status = zone_status_get(zone);
3278                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3279                         continue;
3280                 /*
3281                  * Bail immediately if any callback invocation returns a
3282                  * non-zero value.
3283                  */
3284                 ret = (*cb)(zone, data);
3285                 if (ret != 0)
3286                         break;
3287         }
3288         mutex_exit(&zonehash_lock);
3289         return (ret);
3290 }
3291 
/*
 * Resolve the path named by 'upath' (fetched with pn_get() using
 * UIO_USERSPACE) and record it as the zone's root.
 *
 * On success, returns 0 with:
 *   zone_rootvp       the resolved vnode, still held;
 *   zone_rootpath     a newly allocated copy of the resolved path with a
 *                     trailing '/' appended;
 *   zone_rootpathlen  the size of that allocation, including the
 *                     terminating NUL;
 *   ZF_IS_SCRATCH     set in zone_flags if the stored path ends in "/lu/".
 *
 * On failure, returns the error from pn_get(), lookuppn(), VOP_ACCESS() or
 * traverse().  An ESTALE result causes the lookup to be retried; ESTALE is
 * returned if the retries are exhausted.
 */
static int
zone_set_root(zone_t *zone, const char *upath)
{
        vnode_t *vp;
        int trycount;
        int error = 0;
        char *path;
        struct pathname upn, pn;
        size_t pathlen;

        if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
                return (error);

        pn_alloc(&pn);

        /* prevent infinite loop */
        /* The pre-decrement test below allows at most nine lookups. */
        trycount = 10;
        for (;;) {
                if (--trycount <= 0) {
                        error = ESTALE;
                        goto out;
                }

                if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
                        /*
                         * VOP_ACCESS() may cover 'vp' with a new
                         * filesystem, if 'vp' is an autoFS vnode.
                         * Get the new 'vp' if so.
                         */
                        if ((error =
                            VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
                            (!vn_ismntpt(vp) ||
                            (error = traverse(&vp)) == 0)) {
                                /* Room for the path, a trailing '/' and NUL. */
                                pathlen = pn.pn_pathlen + 2;
                                path = kmem_alloc(pathlen, KM_SLEEP);
                                (void) strncpy(path, pn.pn_path,
                                    pn.pn_pathlen + 1);
                                path[pathlen - 2] = '/';
                                path[pathlen - 1] = '\0';
                                pn_free(&pn);
                                pn_free(&upn);

                                /* Success! */
                                break;
                        }
                        /* Access check or traverse failed: drop the vnode. */
                        VN_RELE(vp);
                }
                /* Only ESTALE is retried; any other error is final. */
                if (error != ESTALE)
                        goto out;
        }

        ASSERT(error == 0);
        zone->zone_rootvp = vp;              /* we hold a reference to vp */
        zone->zone_rootpath = path;
        zone->zone_rootpathlen = pathlen;
        /*
         * pathlen counts the terminating NUL, so path + pathlen - 5 is the
         * last four characters of the stored path.
         */
        if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
                zone->zone_flags |= ZF_IS_SCRATCH;
        return (0);

out:
        /* Failure: release both pathname buffers; the zone is untouched. */
        pn_free(&pn);
        pn_free(&upn);
        return (error);
}
3356 
/*
 * True when 'c' is one of the characters '0'-'9', 'a'-'z' or 'A'-'Z'.
 * The argument is expanded up to six times, so it must not have side
 * effects.
 */
#define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
                        ((c) >= 'a' && (c) <= 'z') || \
                        ((c) >= 'A' && (c) <= 'Z'))
3360 
3361 static int
3362 zone_set_name(zone_t *zone, const char *uname)
3363 {
3364         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3365         size_t len;
3366         int i, err;
3367 
3368         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3369                 kmem_free(kname, ZONENAME_MAX);
3370                 return (err);   /* EFAULT or ENAMETOOLONG */
3371         }
3372 
3373         /* must be less than ZONENAME_MAX */
3374         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3375                 kmem_free(kname, ZONENAME_MAX);
3376                 return (EINVAL);
3377         }
3378 
3379         /*
3380          * Name must start with an alphanumeric and must contain only
3381          * alphanumerics, '-', '_' and '.'.
3382          */
3383         if (!isalnum(kname[0])) {
3384                 kmem_free(kname, ZONENAME_MAX);
3385                 return (EINVAL);
3386         }
3387         for (i = 1; i < len - 1; i++) {
3388                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3389                     kname[i] != '.') {
3390                         kmem_free(kname, ZONENAME_MAX);
3391                         return (EINVAL);
3392                 }
3393         }
3394 
3395         zone->zone_name = kname;
3396         return (0);
3397 }
3398 
3399 /*
3400  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3401  * is NULL or it points to a zone with no hostid emulation, then the machine's
3402  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3403  * zero if neither the zone nor the host machine (global zone) have hostids.  It
3404  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3405  * hostid and the machine's hostid is invalid.
3406  */
3407 uint32_t
3408 zone_get_hostid(zone_t *zonep)
3409 {
3410         unsigned long machine_hostid;
3411 
3412         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3413                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3414                         return (HW_INVALID_HOSTID);
3415                 return ((uint32_t)machine_hostid);
3416         }
3417         return (zonep->zone_hostid);
3418 }
3419 
3420 /*
3421  * Similar to thread_create(), but makes sure the thread is in the appropriate
3422  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3423  */
3424 /*ARGSUSED*/
3425 kthread_t *
3426 zthread_create(
3427     caddr_t stk,
3428     size_t stksize,
3429     void (*proc)(),
3430     void *arg,
3431     size_t len,
3432     pri_t pri)
3433 {
3434         kthread_t *t;
3435         zone_t *zone = curproc->p_zone;
3436         proc_t *pp = zone->zone_zsched;
3437 
3438         zone_hold(zone);        /* Reference to be dropped when thread exits */
3439 
3440         /*
3441          * No-one should be trying to create threads if the zone is shutting
3442          * down and there aren't any kernel threads around.  See comment
3443          * in zthread_exit().
3444          */
3445         ASSERT(!(zone->zone_kthreads == NULL &&
3446             zone_status_get(zone) >= ZONE_IS_EMPTY));
3447         /*
3448          * Create a thread, but don't let it run until we've finished setting
3449          * things up.
3450          */
3451         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3452         ASSERT(t->t_forw == NULL);
3453         mutex_enter(&zone_status_lock);
3454         if (zone->zone_kthreads == NULL) {
3455                 t->t_forw = t->t_back = t;
3456         } else {
3457                 kthread_t *tx = zone->zone_kthreads;
3458 
3459                 t->t_forw = tx;
3460                 t->t_back = tx->t_back;
3461                 tx->t_back->t_forw = t;
3462                 tx->t_back = t;
3463         }
3464         zone->zone_kthreads = t;
3465         mutex_exit(&zone_status_lock);
3466 
3467         mutex_enter(&pp->p_lock);
3468         t->t_proc_flag |= TP_ZTHREAD;
3469         project_rele(t->t_proj);
3470         t->t_proj = project_hold(pp->p_task->tk_proj);
3471 
3472         /*
3473          * Setup complete, let it run.
3474          */
3475         thread_lock(t);
3476         t->t_schedflag |= TS_ALLSTART;
3477         setrun_locked(t);
3478         thread_unlock(t);
3479 
3480         mutex_exit(&pp->p_lock);
3481 
3482         return (t);
3483 }
3484 
3485 /*
3486  * Similar to thread_exit().  Must be called by threads created via
3487  * zthread_exit().
3488  */
3489 void
3490 zthread_exit(void)
3491 {
3492         kthread_t *t = curthread;
3493         proc_t *pp = curproc;
3494         zone_t *zone = pp->p_zone;
3495 
3496         mutex_enter(&zone_status_lock);
3497 
3498         /*
3499          * Reparent to p0
3500          */
3501         kpreempt_disable();
3502         mutex_enter(&pp->p_lock);
3503         t->t_proc_flag &= ~TP_ZTHREAD;
3504         t->t_procp = &p0;
3505         hat_thread_exit(t);
3506         mutex_exit(&pp->p_lock);
3507         kpreempt_enable();
3508 
3509         if (t->t_back == t) {
3510                 ASSERT(t->t_forw == t);
3511                 /*
3512                  * If the zone is empty, once the thread count
3513                  * goes to zero no further kernel threads can be
3514                  * created.  This is because if the creator is a process
3515                  * in the zone, then it must have exited before the zone
3516                  * state could be set to ZONE_IS_EMPTY.
3517                  * Otherwise, if the creator is a kernel thread in the
3518                  * zone, the thread count is non-zero.
3519                  *
3520                  * This really means that non-zone kernel threads should
3521                  * not create zone kernel threads.
3522                  */
3523                 zone->zone_kthreads = NULL;
3524                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3525                         zone_status_set(zone, ZONE_IS_DOWN);
3526                         /*
3527                          * Remove any CPU caps on this zone.
3528                          */
3529                         cpucaps_zone_remove(zone);
3530                 }
3531         } else {
3532                 t->t_forw->t_back = t->t_back;
3533                 t->t_back->t_forw = t->t_forw;
3534                 if (zone->zone_kthreads == t)
3535                         zone->zone_kthreads = t->t_forw;
3536         }
3537         mutex_exit(&zone_status_lock);
3538         zone_rele(zone);
3539         thread_exit();
3540         /* NOTREACHED */
3541 }
3542 
3543 static void
3544 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3545 {
3546         vnode_t *oldvp;
3547 
3548         /* we're going to hold a reference here to the directory */
3549         VN_HOLD(vp);
3550 
3551         /* update abs cwd/root path see c2/audit.c */
3552         if (AU_AUDITING())
3553                 audit_chdirec(vp, vpp);
3554 
3555         mutex_enter(&pp->p_lock);
3556         oldvp = *vpp;
3557         *vpp = vp;
3558         mutex_exit(&pp->p_lock);
3559         if (oldvp != NULL)
3560                 VN_RELE(oldvp);
3561 }
3562 
3563 /*
3564  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3565  */
3566 static int
3567 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3568 {
3569         nvpair_t *nvp = NULL;
3570         boolean_t priv_set = B_FALSE;
3571         boolean_t limit_set = B_FALSE;
3572         boolean_t action_set = B_FALSE;
3573 
3574         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3575                 const char *name;
3576                 uint64_t ui64;
3577 
3578                 name = nvpair_name(nvp);
3579                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3580                         return (EINVAL);
3581                 (void) nvpair_value_uint64(nvp, &ui64);
3582                 if (strcmp(name, "privilege") == 0) {
3583                         /*
3584                          * Currently only privileged values are allowed, but
3585                          * this may change in the future.
3586                          */
3587                         if (ui64 != RCPRIV_PRIVILEGED)
3588                                 return (EINVAL);
3589                         rv->rcv_privilege = ui64;
3590                         priv_set = B_TRUE;
3591                 } else if (strcmp(name, "limit") == 0) {
3592                         rv->rcv_value = ui64;
3593                         limit_set = B_TRUE;
3594                 } else if (strcmp(name, "action") == 0) {
3595                         if (ui64 != RCTL_LOCAL_NOACTION &&
3596                             ui64 != RCTL_LOCAL_DENY)
3597                                 return (EINVAL);
3598                         rv->rcv_flagaction = ui64;
3599                         action_set = B_TRUE;
3600                 } else {
3601                         return (EINVAL);
3602                 }
3603         }
3604 
3605         if (!(priv_set && limit_set && action_set))
3606                 return (EINVAL);
3607         rv->rcv_action_signal = 0;
3608         rv->rcv_action_recipient = NULL;
3609         rv->rcv_action_recip_pid = -1;
3610         rv->rcv_firing_time = 0;
3611 
3612         return (0);
3613 }
3614 
3615 /*
3616  * Non-global zone version of start_init.
3617  */
3618 void
3619 zone_start_init(void)
3620 {
3621         proc_t *p = ttoproc(curthread);
3622         zone_t *z = p->p_zone;
3623 
3624         ASSERT(!INGLOBALZONE(curproc));
3625 
3626         /*
3627          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3628          * storing just the pid of init is sufficient.
3629          */
3630         z->zone_proc_initpid = p->p_pid;
3631 
3632         /*
3633          * We maintain zone_boot_err so that we can return the cause of the
3634          * failure back to the caller of the zone_boot syscall.
3635          */
3636         p->p_zone->zone_boot_err = start_init_common();
3637 
3638         /*
3639          * We will prevent booting zones from becoming running zones if the
3640          * global zone is shutting down.
3641          */
3642         mutex_enter(&zone_status_lock);
3643         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3644             ZONE_IS_SHUTTING_DOWN) {
3645                 /*
3646                  * Make sure we are still in the booting state-- we could have
3647                  * raced and already be shutting down, or even further along.
3648                  */
3649                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
3650                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3651                 }
3652                 mutex_exit(&zone_status_lock);
3653                 /* It's gone bad, dispose of the process */
3654                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3655                         mutex_enter(&p->p_lock);
3656                         ASSERT(p->p_flag & SEXITLWPS);
3657                         lwp_exit();
3658                 }
3659         } else {
3660                 if (zone_status_get(z) == ZONE_IS_BOOTING)
3661                         zone_status_set(z, ZONE_IS_RUNNING);
3662                 mutex_exit(&zone_status_lock);
3663                 /* cause the process to return to userland. */
3664                 lwp_rtt();
3665         }
3666 }
3667 
3668 struct zsched_arg {
3669         zone_t *zone;
3670         nvlist_t *nvlist;
3671 };
3672 
3673 /*
3674  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3675  * anything to do with scheduling, but rather with the fact that
3676  * per-zone kernel threads are parented to zsched, just like regular
3677  * kernel threads are parented to sched (p0).
3678  *
3679  * zsched is also responsible for launching init for the zone.
      *
      * 'arg' points to a struct zsched_arg naming the zone to set up and the
      * nvlist of rctl values to apply to it.  After setup the function waits
      * for the zone to reach ZONE_IS_BOOTING (launching init if it does),
      * then waits for ZONE_IS_DYING, and finally calls exit().
3680  */
3681 static void
3682 zsched(void *arg)
3683 {
3684         struct zsched_arg *za = arg;
3685         proc_t *pp = curproc;
3686         proc_t *initp = proc_init;
3687         zone_t *zone = za->zone;
3688         cred_t *cr, *oldcred;
3689         rctl_set_t *set;
3690         rctl_alloc_gp_t *gp;
3691         contract_t *ct = NULL;
3692         task_t *tk, *oldtk;
3693         rctl_entity_p_t e;
3694         kproject_t *pj;
3695 
3696         nvlist_t *nvl = za->nvlist;
3697         nvpair_t *nvp = NULL;
3698 
             /*
              * Give the process a "zsched" identity: name, no arguments or
              * environment, and no open files.
              */
3699         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3700         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3701         PTOU(pp)->u_argc = 0;
3702         PTOU(pp)->u_argv = NULL;
3703         PTOU(pp)->u_envp = NULL;
3704         closeall(P_FINFO(pp));
3705 
3706         /*
3707          * We are this zone's "zsched" process.  As the zone isn't generally
3708          * visible yet we don't need to grab any locks before initializing its
3709          * zone_proc pointer.
3710          */
3711         zone_hold(zone);  /* this hold is released by zone_destroy() */
3712         zone->zone_zsched = pp;
3713         mutex_enter(&pp->p_lock);
3714         pp->p_zone = zone;
3715         mutex_exit(&pp->p_lock);
3716 
3717         /*
3718          * Disassociate process from its 'parent'; parent ourselves to init
3719          * (pid 1) and change other values as needed.
3720          */
3721         sess_create();
3722 
3723         mutex_enter(&pidlock);
3724         proc_detach(pp);
3725         pp->p_ppid = 1;
3726         pp->p_flag |= SZONETOP;
3727         pp->p_ancpid = 1;
3728         pp->p_parent = initp;
             /* Link ourselves in at the head of init's child list. */
3729         pp->p_psibling = NULL;
3730         if (initp->p_child)
3731                 initp->p_child->p_psibling = pp;
3732         pp->p_sibling = initp->p_child;
3733         initp->p_child = pp;
3734 
3735         /* Decrement what newproc() incremented. */
3736         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3737         /*
3738          * Our credentials are about to become kcred-like, so we don't care
3739          * about the caller's ruid.
3740          */
3741         upcount_inc(crgetruid(kcred), zone->zone_id);
3742         mutex_exit(&pidlock);
3743 
3744         /*
3745          * getting out of global zone, so decrement lwp and process counts
3746          */
3747         pj = pp->p_task->tk_proj;
3748         mutex_enter(&global_zone->zone_nlwps_lock);
3749         pj->kpj_nlwps -= pp->p_lwpcnt;
3750         global_zone->zone_nlwps -= pp->p_lwpcnt;
3751         pj->kpj_nprocs--;
3752         global_zone->zone_nprocs--;
3753         mutex_exit(&global_zone->zone_nlwps_lock);
3754 
3755         /*
3756          * Decrement locked memory counts on old zone and project.
3757          */
3758         mutex_enter(&global_zone->zone_mem_lock);
3759         global_zone->zone_locked_mem -= pp->p_locked_mem;
3760         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3761         mutex_exit(&global_zone->zone_mem_lock);
3762 
3763         /*
3764          * Create and join a new task in project '0' of this zone.
3765          *
3766          * We don't need to call holdlwps() since we know we're the only lwp in
3767          * this process.
3768          *
3769          * task_join() returns with p_lock held.
3770          */
3771         tk = task_create(0, zone);
3772         mutex_enter(&cpu_lock);
3773         oldtk = task_join(tk, 0);
3774 
             /* Re-read the project: p_task now refers to the new task. */
3775         pj = pp->p_task->tk_proj;
3776 
3777         mutex_enter(&zone->zone_mem_lock);
3778         zone->zone_locked_mem += pp->p_locked_mem;
3779         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3780         mutex_exit(&zone->zone_mem_lock);
3781 
3782         /*
3783          * add lwp and process counts to zsched's zone, and increment
3784          * project's task and process count due to the task created in
3785          * the above task_create.
3786          */
3787         mutex_enter(&zone->zone_nlwps_lock);
3788         pj->kpj_nlwps += pp->p_lwpcnt;
3789         pj->kpj_ntasks += 1;
3790         zone->zone_nlwps += pp->p_lwpcnt;
3791         pj->kpj_nprocs++;
3792         zone->zone_nprocs++;
3793         mutex_exit(&zone->zone_nlwps_lock);
3794 
             /* Drop the p_lock that task_join() returned holding. */
3795         mutex_exit(&curproc->p_lock);
3796         mutex_exit(&cpu_lock);
3797         task_rele(oldtk);
3798 
3799         /*
3800          * The process was created by a process in the global zone, hence the
3801          * credentials are wrong.  We might as well have kcred-ish credentials.
3802          */
3803         cr = zone->zone_kcred;
3804         crhold(cr);
3805         mutex_enter(&pp->p_crlock);
3806         oldcred = pp->p_cred;
3807         pp->p_cred = cr;
3808         mutex_exit(&pp->p_crlock);
3809         crfree(oldcred);
3810 
3811         /*
3812          * Hold credentials again (for thread)
3813          */
3814         crhold(cr);
3815 
3816         /*
3817          * p_lwpcnt can't change since this is a kernel process.
3818          */
3819         crset(pp, cr);
3820 
3821         /*
3822          * Chroot
              *
              * Both the current directory and the root directory are pointed
              * at the zone's root vnode.
3823          */
3824         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3825         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3826 
3827         /*
3828          * Initialize zone's rctl set.
3829          */
3830         set = rctl_set_create();
3831         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3832         mutex_enter(&pp->p_lock);
3833         e.rcep_p.zone = zone;
3834         e.rcep_t = RCENTITY_ZONE;
3835         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3836         mutex_exit(&pp->p_lock);
3837         rctl_prealloc_destroy(gp);
3838 
3839         /*
3840          * Apply the rctls passed in to zone_create().  This is basically a list
3841          * assignment: all of the old values are removed and the new ones
3842          * inserted.  That is, if an empty list is passed in, all values are
3843          * removed.
3844          */
3845         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3846                 rctl_dict_entry_t *rde;
3847                 rctl_hndl_t hndl;
3848                 char *name;
3849                 nvlist_t **nvlarray;
3850                 uint_t i, nelem;
3851                 int error;      /* For ASSERT()s */
3852 
3853                 name = nvpair_name(nvp);
3854                 hndl = rctl_hndl_lookup(name);
3855                 ASSERT(hndl != -1);
3856                 rde = rctl_dict_lookup_hndl(hndl);
3857                 ASSERT(rde != NULL);
3858 
                     /*
                      * Remove this rctl's existing local values one at a
                      * time, stopping at the first RCPRIV_SYSTEM value.
                      */
3859                 for (; /* ever */; ) {
3860                         rctl_val_t oval;
3861 
3862                         mutex_enter(&pp->p_lock);
3863                         error = rctl_local_get(hndl, NULL, &oval, pp);
3864                         mutex_exit(&pp->p_lock);
3865                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3866                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3867                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
3868                                 break;
3869                         mutex_enter(&pp->p_lock);
3870                         error = rctl_local_delete(hndl, &oval, pp);
3871                         mutex_exit(&pp->p_lock);
3872                         ASSERT(error == 0);
3873                 }
                     /* Now insert each value supplied for this rctl. */
3874                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3875                 ASSERT(error == 0);
3876                 for (i = 0; i < nelem; i++) {
3877                         rctl_val_t *nvalp;
3878 
3879                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
3880                         error = nvlist2rctlval(nvlarray[i], nvalp);
3881                         ASSERT(error == 0);
3882                         /*
3883                          * rctl_local_insert can fail if the value being
3884                          * inserted is a duplicate; this is OK.
3885                          */
3886                         mutex_enter(&pp->p_lock);
3887                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
3888                                 kmem_cache_free(rctl_val_cache, nvalp);
3889                         mutex_exit(&pp->p_lock);
3890                 }
3891         }
3892         /*
3893          * Tell the world that we're done setting up.
3894          *
3895          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
3896          * and atomically set the zone's processor set visibility.  Once
3897          * we drop pool_lock() this zone will automatically get updated
3898          * to reflect any future changes to the pools configuration.
3899          *
3900          * Note that after we drop the locks below (zonehash_lock in
3901          * particular) other operations such as a zone_getattr call can
3902          * now proceed and observe the zone. That is the reason for doing a
3903          * state transition to the INITIALIZED state.
3904          */
3905         pool_lock();
3906         mutex_enter(&cpu_lock);
3907         mutex_enter(&zonehash_lock);
3908         zone_uniqid(zone);
3909         zone_zsd_configure(zone);
3910         if (pool_state == POOL_ENABLED)
3911                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
3912         mutex_enter(&zone_status_lock);
3913         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
3914         zone_status_set(zone, ZONE_IS_INITIALIZED);
3915         mutex_exit(&zone_status_lock);
3916         mutex_exit(&zonehash_lock);
3917         mutex_exit(&cpu_lock);
3918         pool_unlock();
3919 
3920         /* Now call the create callback for this key */
3921         zsd_apply_all_keys(zsd_apply_create, zone);
3922 
3923         /* The callbacks are complete. Mark ZONE_IS_READY */
3924         mutex_enter(&zone_status_lock);
3925         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
3926         zone_status_set(zone, ZONE_IS_READY);
3927         mutex_exit(&zone_status_lock);
3928 
3929         /*
3930          * Once we see the zone transition to the ZONE_IS_BOOTING state,
3931          * we launch init, and set the state to running.
3932          */
3933         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
3934 
             /*
              * The status is re-checked after the wait; init is only launched
              * if the zone is exactly in ZONE_IS_BOOTING.
              */
3935         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
3936                 id_t cid;
3937 
3938                 /*
3939                  * Ok, this is a little complicated.  We need to grab the
3940                  * zone's pool's scheduling class ID; note that by now, we
3941                  * are already bound to a pool if we need to be (zoneadmd
3942                  * will have done that to us while we're in the READY
3943                  * state).  *But* the scheduling class for the zone's 'init'
3944                  * must be explicitly passed to newproc, which doesn't
3945                  * respect pool bindings.
3946                  *
3947                  * We hold the pool_lock across the call to newproc() to
3948                  * close the obvious race: the pool's scheduling class
3949                  * could change before we manage to create the LWP with
3950                  * classid 'cid'.
3951                  */
3952                 pool_lock();
                     /*
                      * A positive zone_defaultcid takes precedence over the
                      * pool's class; defaultcid is the fallback when the
                      * chosen value is -1.
                      */
3953                 if (zone->zone_defaultcid > 0)
3954                         cid = zone->zone_defaultcid;
3955                 else
3956                         cid = pool_get_class(zone->zone_pool);
3957                 if (cid == -1)
3958                         cid = defaultcid;
3959 
3960                 /*
3961                  * If this fails, zone_boot will ultimately fail.  The
3962                  * state of the zone will be set to SHUTTING_DOWN-- userland
3963                  * will have to tear down the zone, and fail, or try again.
3964                  */
3965                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
3966                     minclsyspri - 1, &ct, 0)) != 0) {
3967                         mutex_enter(&zone_status_lock);
3968                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3969                         mutex_exit(&zone_status_lock);
3970                 } else {
3971                         zone->zone_boot_time = gethrestime_sec();
3972                         zone->zone_boot_hrtime = gethrtime();
3973                 }
3974 
3975                 pool_unlock();
3976         }
3977 
3978         /*
3979          * Wait for zone_destroy() to be called.  This is what we spend
3980          * most of our life doing.
3981          */
3982         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
3983 
3984         if (ct)
3985                 /*
3986                  * At this point the process contract should be empty.
3987                  * (Though if it isn't, it's not the end of the world.)
                      *
                      * NOTE(review): the call is nevertheless wrapped in
                      * VERIFY(), which presumably panics if
                      * contract_abandon() returns non-zero -- confirm that
                      * this matches the intent stated above.
3988                  */
3989                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
3990 
3991         /*
3992          * Allow kcred to be freed when all referring processes
3993          * (including this one) go away.  We can't just do this in
3994          * zone_free because we need to wait for the zone_cred_ref to
3995          * drop to 0 before calling zone_free, and the existence of
3996          * zone_kcred will prevent that.  Thus, we call crfree here to
3997          * balance the crdup in zone_create.  The crhold calls earlier
3998          * in zsched will be dropped when the thread and process exit.
3999          */
4000         crfree(zone->zone_kcred);
4001         zone->zone_kcred = NULL;
4002 
4003         exit(CLD_EXITED, 0);
4004 }
4005 
4006 /*
4007  * Helper function to determine if there are any submounts of the
4008  * provided path.  Used to make sure the zone doesn't "inherit" any
4009  * mounts from before it is created.
4010  */
4011 static uint_t
4012 zone_mount_count(const char *rootpath)
4013 {
4014         vfs_t *vfsp;
4015         uint_t count = 0;
4016         size_t rootpathlen = strlen(rootpath);
4017 
4018         /*
4019          * Holding zonehash_lock prevents race conditions with
4020          * vfs_list_add()/vfs_list_remove() since we serialize with
4021          * zone_find_by_path().
4022          */
4023         ASSERT(MUTEX_HELD(&zonehash_lock));
4024         /*
4025          * The rootpath must end with a '/'
4026          */
4027         ASSERT(rootpath[rootpathlen - 1] == '/');
4028 
4029         /*
4030          * This intentionally does not count the rootpath itself if that
4031          * happens to be a mount point.
4032          */
4033         vfs_list_read_lock();
4034         vfsp = rootvfs;
4035         do {
4036                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4037                     rootpathlen) == 0)
4038                         count++;
4039                 vfsp = vfsp->vfs_next;
4040         } while (vfsp != rootvfs);
4041         vfs_list_unlock();
4042         return (count);
4043 }
4044 
4045 /*
4046  * Helper function to make sure that a zone created on 'rootpath'
4047  * wouldn't end up containing other zones' rootpaths.
4048  */
4049 static boolean_t
4050 zone_is_nested(const char *rootpath)
4051 {
4052         zone_t *zone;
4053         size_t rootpathlen = strlen(rootpath);
4054         size_t len;
4055 
4056         ASSERT(MUTEX_HELD(&zonehash_lock));
4057 
4058         /*
4059          * zone_set_root() appended '/' and '\0' at the end of rootpath
4060          */
4061         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4062             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4063                 return (B_TRUE);
4064 
4065         for (zone = list_head(&zone_active); zone != NULL;
4066             zone = list_next(&zone_active, zone)) {
4067                 if (zone == global_zone)
4068                         continue;
4069                 len = strlen(zone->zone_rootpath);
4070                 if (strncmp(rootpath, zone->zone_rootpath,
4071                     MIN(rootpathlen, len)) == 0)
4072                         return (B_TRUE);
4073         }
4074         return (B_FALSE);
4075 }
4076 
4077 static int
4078 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4079     size_t zone_privssz)
4080 {
4081         priv_set_t *privs;
4082 
4083         if (zone_privssz < sizeof (priv_set_t))
4084                 return (ENOMEM);
4085 
4086         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4087 
4088         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4089                 kmem_free(privs, sizeof (priv_set_t));
4090                 return (EFAULT);
4091         }
4092 
4093         zone->zone_privset = privs;
4094         return (0);
4095 }
4096 
4097 /*
4098  * We make creative use of nvlists to pass in rctls from userland.  The list is
4099  * a list of the following structures:
4100  *
4101  * (name = rctl_name, value = nvpair_list_array)
4102  *
4103  * Where each element of the nvpair_list_array is of the form:
4104  *
4105  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4106  *      (name = "limit", value = uint64_t),
4107  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4108  */
4109 static int
4110 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4111 {
4112         nvpair_t *nvp = NULL;
4113         nvlist_t *nvl = NULL;
4114         char *kbuf;
4115         int error;
4116         rctl_val_t rv;
4117 
4118         *nvlp = NULL;
4119 
4120         if (buflen == 0)
4121                 return (0);
4122 
4123         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4124                 return (ENOMEM);
4125         if (copyin(ubuf, kbuf, buflen)) {
4126                 error = EFAULT;
4127                 goto out;
4128         }
4129         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4130                 /*
4131                  * nvl may have been allocated/free'd, but the value set to
4132                  * non-NULL, so we reset it here.
4133                  */
4134                 nvl = NULL;
4135                 error = EINVAL;
4136                 goto out;
4137         }
4138         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4139                 rctl_dict_entry_t *rde;
4140                 rctl_hndl_t hndl;
4141                 nvlist_t **nvlarray;
4142                 uint_t i, nelem;
4143                 char *name;
4144 
4145                 error = EINVAL;
4146                 name = nvpair_name(nvp);
4147                 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4148                     != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4149                         goto out;
4150                 }
4151                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4152                         goto out;
4153                 }
4154                 rde = rctl_dict_lookup_hndl(hndl);
4155                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4156                 ASSERT(error == 0);
4157                 for (i = 0; i < nelem; i++) {
4158                         if (error = nvlist2rctlval(nvlarray[i], &rv))
4159                                 goto out;
4160                 }
4161                 if (rctl_invalid_value(rde, &rv)) {
4162                         error = EINVAL;
4163                         goto out;
4164                 }
4165         }
4166         error = 0;
4167         *nvlp = nvl;
4168 out:
4169         kmem_free(kbuf, buflen);
4170         if (error && nvl != NULL)
4171                 nvlist_free(nvl);
4172         return (error);
4173 }
4174 
4175 int
4176 zone_create_error(int er_error, int er_ext, int *er_out) {
4177         if (er_out != NULL) {
4178                 if (copyout(&er_ext, er_out, sizeof (int))) {
4179                         return (set_errno(EFAULT));
4180                 }
4181         }
4182         return (set_errno(er_error));
4183 }
4184 
4185 static int
4186 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4187 {
4188         ts_label_t *tsl;
4189         bslabel_t blab;
4190 
4191         /* Get label from user */
4192         if (copyin(lab, &blab, sizeof (blab)) != 0)
4193                 return (EFAULT);
4194         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4195         if (tsl == NULL)
4196                 return (ENOMEM);
4197 
4198         zone->zone_slabel = tsl;
4199         return (0);
4200 }
4201 
4202 /*
4203  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4204  */
4205 static int
4206 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4207 {
4208         char *kbuf;
4209         char *dataset, *next;
4210         zone_dataset_t *zd;
4211         size_t len;
4212 
4213         if (ubuf == NULL || buflen == 0)
4214                 return (0);
4215 
4216         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4217                 return (ENOMEM);
4218 
4219         if (copyin(ubuf, kbuf, buflen) != 0) {
4220                 kmem_free(kbuf, buflen);
4221                 return (EFAULT);
4222         }
4223 
4224         dataset = next = kbuf;
4225         for (;;) {
4226                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4227 
4228                 next = strchr(dataset, ',');
4229 
4230                 if (next == NULL)
4231                         len = strlen(dataset);
4232                 else
4233                         len = next - dataset;
4234 
4235                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4236                 bcopy(dataset, zd->zd_dataset, len);
4237                 zd->zd_dataset[len] = '\0';
4238 
4239                 list_insert_head(&zone->zone_datasets, zd);
4240 
4241                 if (next == NULL)
4242                         break;
4243 
4244                 dataset = next + 1;
4245         }
4246 
4247         kmem_free(kbuf, buflen);
4248         return (0);
4249 }
4250 
4251 /*
4252  * System call to create/initialize a new zone named 'zone_name', rooted
4253  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4254  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4255  * with labeling set by 'match', 'doi', and 'label'.
4256  *
4257  * If extended error is non-null, we may use it to return more detailed
4258  * error information.
4259  */
4260 static zoneid_t
4261 zone_create(const char *zone_name, const char *zone_root,
4262     const priv_set_t *zone_privs, size_t zone_privssz,
4263     caddr_t rctlbuf, size_t rctlbufsz,
4264     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4265     int match, uint32_t doi, const bslabel_t *label,
4266     int flags)
4267 {
4268         struct zsched_arg zarg;
4269         nvlist_t *rctls = NULL;
4270         proc_t *pp = curproc;
4271         zone_t *zone, *ztmp;
4272         zoneid_t zoneid;
4273         int error;
4274         int error2 = 0;
4275         char *str;
4276         cred_t *zkcr;
4277         boolean_t insert_label_hash;
4278 
4279         if (secpolicy_zone_config(CRED()) != 0)
4280                 return (set_errno(EPERM));
4281 
4282         /* can't boot zone from within chroot environment */
4283         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4284                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4285                     extended_error));
4286 
4287         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4288         zoneid = zone->zone_id = id_alloc(zoneid_space);
4289         zone->zone_status = ZONE_IS_UNINITIALIZED;
4290         zone->zone_pool = pool_default;
4291         zone->zone_pool_mod = gethrtime();
4292         zone->zone_psetid = ZONE_PS_INVAL;
4293         zone->zone_ncpus = 0;
4294         zone->zone_ncpus_online = 0;
4295         zone->zone_restart_init = B_TRUE;
4296         zone->zone_brand = &native_brand;
4297         zone->zone_initname = NULL;
4298         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4299         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4300         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4301         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4302         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4303             offsetof(zone_ref_t, zref_linkage));
4304         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4305             offsetof(struct zsd_entry, zsd_linkage));
4306         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4307             offsetof(zone_dataset_t, zd_linkage));
4308         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4309             offsetof(zone_dl_t, zdl_linkage));
4310         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4311         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4312 
4313         if (flags & ZCF_NET_EXCL) {
4314                 zone->zone_flags |= ZF_NET_EXCL;
4315         }
4316 
4317         if ((error = zone_set_name(zone, zone_name)) != 0) {
4318                 zone_free(zone);
4319                 return (zone_create_error(error, 0, extended_error));
4320         }
4321 
4322         if ((error = zone_set_root(zone, zone_root)) != 0) {
4323                 zone_free(zone);
4324                 return (zone_create_error(error, 0, extended_error));
4325         }
4326         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4327                 zone_free(zone);
4328                 return (zone_create_error(error, 0, extended_error));
4329         }
4330 
4331         /* initialize node name to be the same as zone name */
4332         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4333         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4334         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4335 
4336         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4337         zone->zone_domain[0] = '\0';
4338         zone->zone_hostid = HW_INVALID_HOSTID;
4339         zone->zone_shares = 1;
4340         zone->zone_shmmax = 0;
4341         zone->zone_ipc.ipcq_shmmni = 0;
4342         zone->zone_ipc.ipcq_semmni = 0;
4343         zone->zone_ipc.ipcq_msgmni = 0;
4344         zone->zone_bootargs = NULL;
4345         zone->zone_fs_allowed = NULL;
4346         zone->zone_initname =
4347             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4348         (void) strcpy(zone->zone_initname, zone_default_initname);
4349         zone->zone_nlwps = 0;
4350         zone->zone_nlwps_ctl = INT_MAX;
4351         zone->zone_nprocs = 0;
4352         zone->zone_nprocs_ctl = INT_MAX;
4353         zone->zone_locked_mem = 0;
4354         zone->zone_locked_mem_ctl = UINT64_MAX;
4355         zone->zone_max_swap = 0;
4356         zone->zone_max_swap_ctl = UINT64_MAX;
4357         zone->zone_max_lofi = 0;
4358         zone->zone_max_lofi_ctl = UINT64_MAX;
4359         zone0.zone_lockedmem_kstat = NULL;
4360         zone0.zone_swapresv_kstat = NULL;
4361 
4362         /*
4363          * Zsched initializes the rctls.
4364          */
4365         zone->zone_rctls = NULL;
4366 
4367         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4368                 zone_free(zone);
4369                 return (zone_create_error(error, 0, extended_error));
4370         }
4371 
4372         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4373                 zone_free(zone);
4374                 return (set_errno(error));
4375         }
4376 
4377         /*
4378          * Read in the trusted system parameters:
4379          * match flag and sensitivity label.
4380          */
4381         zone->zone_match = match;
4382         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4383                 /* Fail if requested to set doi to anything but system's doi */
4384                 if (doi != 0 && doi != default_doi) {
4385                         zone_free(zone);
4386                         return (set_errno(EINVAL));
4387                 }
4388                 /* Always apply system's doi to the zone */
4389                 error = zone_set_label(zone, label, default_doi);
4390                 if (error != 0) {
4391                         zone_free(zone);
4392                         return (set_errno(error));
4393                 }
4394                 insert_label_hash = B_TRUE;
4395         } else {
4396                 /* all zones get an admin_low label if system is not labeled */
4397                 zone->zone_slabel = l_admin_low;
4398                 label_hold(l_admin_low);
4399                 insert_label_hash = B_FALSE;
4400         }
4401 
4402         /*
4403          * Stop all lwps since that's what normally happens as part of fork().
4404          * This needs to happen before we grab any locks to avoid deadlock
4405          * (another lwp in the process could be waiting for the held lock).
4406          */
4407         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4408                 zone_free(zone);
4409                 if (rctls)
4410                         nvlist_free(rctls);
4411                 return (zone_create_error(error, 0, extended_error));
4412         }
4413 
4414         if (block_mounts(zone) == 0) {
4415                 mutex_enter(&pp->p_lock);
4416                 if (curthread != pp->p_agenttp)
4417                         continuelwps(pp);
4418                 mutex_exit(&pp->p_lock);
4419                 zone_free(zone);
4420                 if (rctls)
4421                         nvlist_free(rctls);
4422                 return (zone_create_error(error, 0, extended_error));
4423         }
4424 
4425         /*
4426          * Set up credential for kernel access.  After this, any errors
4427          * should go through the dance in errout rather than calling
4428          * zone_free directly.
4429          */
4430         zone->zone_kcred = crdup(kcred);
4431         crsetzone(zone->zone_kcred, zone);
4432         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4433         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4434         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4435         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4436 
4437         mutex_enter(&zonehash_lock);
4438         /*
4439          * Make sure zone doesn't already exist.
4440          *
4441          * If the system and zone are labeled,
4442          * make sure no other zone exists that has the same label.
4443          */
4444         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4445             (insert_label_hash &&
4446             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4447                 zone_status_t status;
4448 
4449                 status = zone_status_get(ztmp);
4450                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4451                         error = EEXIST;
4452                 else
4453                         error = EBUSY;
4454 
4455                 if (insert_label_hash)
4456                         error2 = ZE_LABELINUSE;
4457 
4458                 goto errout;
4459         }
4460 
4461         /*
4462          * Don't allow zone creations which would cause one zone's rootpath to
4463          * be accessible from that of another (non-global) zone.
4464          */
4465         if (zone_is_nested(zone->zone_rootpath)) {
4466                 error = EBUSY;
4467                 goto errout;
4468         }
4469 
4470         ASSERT(zonecount != 0);         /* check for leaks */
4471         if (zonecount + 1 > maxzones) {
4472                 error = ENOMEM;
4473                 goto errout;
4474         }
4475 
4476         if (zone_mount_count(zone->zone_rootpath) != 0) {
4477                 error = EBUSY;
4478                 error2 = ZE_AREMOUNTS;
4479                 goto errout;
4480         }
4481 
4482         /*
4483          * Zone is still incomplete, but we need to drop all locks while
4484          * zsched() initializes this zone's kernel process.  We
4485          * optimistically add the zone to the hashtable and associated
4486          * lists so a parallel zone_create() doesn't try to create the
4487          * same zone.
4488          */
4489         zonecount++;
4490         (void) mod_hash_insert(zonehashbyid,
4491             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4492             (mod_hash_val_t)(uintptr_t)zone);
4493         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4494         (void) strcpy(str, zone->zone_name);
4495         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4496             (mod_hash_val_t)(uintptr_t)zone);
4497         if (insert_label_hash) {
4498                 (void) mod_hash_insert(zonehashbylabel,
4499                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4500                 zone->zone_flags |= ZF_HASHED_LABEL;
4501         }
4502 
4503         /*
4504          * Insert into active list.  At this point there are no 'hold's
4505          * on the zone, but everyone else knows not to use it, so we can
4506          * continue to use it.  zsched() will do a zone_hold() if the
4507          * newproc() is successful.
4508          */
4509         list_insert_tail(&zone_active, zone);
4510         mutex_exit(&zonehash_lock);
4511 
4512         zarg.zone = zone;
4513         zarg.nvlist = rctls;
4514         /*
4515          * The process, task, and project rctls are probably wrong;
4516          * we need an interface to get the default values of all rctls,
4517          * and initialize zsched appropriately.  I'm not sure that that
4518          * makes much of a difference, though.
4519          */
4520         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4521         if (error != 0) {
4522                 /*
4523                  * We need to undo all globally visible state.
4524                  */
4525                 mutex_enter(&zonehash_lock);
4526                 list_remove(&zone_active, zone);
4527                 if (zone->zone_flags & ZF_HASHED_LABEL) {
4528                         ASSERT(zone->zone_slabel != NULL);
4529                         (void) mod_hash_destroy(zonehashbylabel,
4530                             (mod_hash_key_t)zone->zone_slabel);
4531                 }
4532                 (void) mod_hash_destroy(zonehashbyname,
4533                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4534                 (void) mod_hash_destroy(zonehashbyid,
4535                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4536                 ASSERT(zonecount > 1);
4537                 zonecount--;
4538                 goto errout;
4539         }
4540 
4541         /*
4542          * Zone creation can't fail from now on.
4543          */
4544 
4545         /*
4546          * Create zone kstats
4547          */
4548         zone_kstat_create(zone);
4549 
4550         /*
4551          * Let the other lwps continue.
4552          */
4553         mutex_enter(&pp->p_lock);
4554         if (curthread != pp->p_agenttp)
4555                 continuelwps(pp);
4556         mutex_exit(&pp->p_lock);
4557 
4558         /*
4559          * Wait for zsched to finish initializing the zone.
4560          */
4561         zone_status_wait(zone, ZONE_IS_READY);
4562         /*
4563          * The zone is fully visible, so we can let mounts progress.
4564          */
4565         resume_mounts(zone);
4566         if (rctls)
4567                 nvlist_free(rctls);
4568 
4569         return (zoneid);
4570 
4571 errout:
4572         mutex_exit(&zonehash_lock);
4573         /*
4574          * Let the other lwps continue.
4575          */
4576         mutex_enter(&pp->p_lock);
4577         if (curthread != pp->p_agenttp)
4578                 continuelwps(pp);
4579         mutex_exit(&pp->p_lock);
4580 
4581         resume_mounts(zone);
4582         if (rctls)
4583                 nvlist_free(rctls);
4584         /*
4585          * There is currently one reference to the zone, a cred_ref from
4586          * zone_kcred.  To free the zone, we call crfree, which will call
4587          * zone_cred_rele, which will call zone_free.
4588          */
4589         ASSERT(zone->zone_cred_ref == 1);
4590         ASSERT(zone->zone_kcred->cr_ref == 1);
4591         ASSERT(zone->zone_ref == 0);
4592         zkcr = zone->zone_kcred;
4593         zone->zone_kcred = NULL;
4594         crfree(zkcr);                           /* triggers call to zone_free */
4595         return (zone_create_error(error, error2, extended_error));
4596 }
4597 
4598 /*
4599  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4600  * the heavy lifting.  initname is the path to the program to launch
4601  * at the "top" of the zone; if this is NULL, we use the system default,
4602  * which is stored at zone_default_initname.
4603  */
4604 static int
4605 zone_boot(zoneid_t zoneid)
4606 {
4607         int err;
4608         zone_t *zone;
4609 
4610         if (secpolicy_zone_config(CRED()) != 0)
4611                 return (set_errno(EPERM));
4612         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4613                 return (set_errno(EINVAL));
4614 
4615         mutex_enter(&zonehash_lock);
4616         /*
4617          * Look for zone under hash lock to prevent races with calls to
4618          * zone_shutdown, zone_destroy, etc.
4619          */
4620         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4621                 mutex_exit(&zonehash_lock);
4622                 return (set_errno(EINVAL));
4623         }
4624 
4625         mutex_enter(&zone_status_lock);
4626         if (zone_status_get(zone) != ZONE_IS_READY) {
4627                 mutex_exit(&zone_status_lock);
4628                 mutex_exit(&zonehash_lock);
4629                 return (set_errno(EINVAL));
4630         }
4631         zone_status_set(zone, ZONE_IS_BOOTING);
4632         mutex_exit(&zone_status_lock);
4633 
4634         zone_hold(zone);        /* so we can use the zone_t later */
4635         mutex_exit(&zonehash_lock);
4636 
4637         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4638                 zone_rele(zone);
4639                 return (set_errno(EINTR));
4640         }
4641 
4642         /*
4643          * Boot (starting init) might have failed, in which case the zone
4644          * will go to the SHUTTING_DOWN state; an appropriate errno will
4645          * be placed in zone->zone_boot_err, and so we return that.
4646          */
4647         err = zone->zone_boot_err;
4648         zone_rele(zone);
4649         return (err ? set_errno(err) : 0);
4650 }
4651 
4652 /*
4653  * Kills all user processes in the zone, waiting for them all to exit
4654  * before returning.
4655  */
4656 static int
4657 zone_empty(zone_t *zone)
4658 {
4659         int waitstatus;
4660 
4661         /*
4662          * We need to drop zonehash_lock before killing all
4663          * processes, otherwise we'll deadlock with zone_find_*
4664          * which can be called from the exit path.
4665          */
4666         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4667         while ((waitstatus = zone_status_timedwait_sig(zone,
4668             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4669                 killall(zone->zone_id);
4670         }
4671         /*
4672          * return EINTR if we were signaled
4673          */
4674         if (waitstatus == 0)
4675                 return (EINTR);
4676         return (0);
4677 }
4678 
4679 /*
4680  * This function implements the policy for zone visibility.
4681  *
4682  * In standard Solaris, a non-global zone can only see itself.
4683  *
4684  * In Trusted Extensions, a labeled zone can lookup any zone whose label
4685  * it dominates. For this test, the label of the global zone is treated as
4686  * admin_high so it is special-cased instead of being checked for dominance.
4687  *
4688  * Returns true if zone attributes are viewable, false otherwise.
4689  */
4690 static boolean_t
4691 zone_list_access(zone_t *zone)
4692 {
4693 
4694         if (curproc->p_zone == global_zone ||
4695             curproc->p_zone == zone) {
4696                 return (B_TRUE);
4697         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4698                 bslabel_t *curproc_label;
4699                 bslabel_t *zone_label;
4700 
4701                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4702                 zone_label = label2bslabel(zone->zone_slabel);
4703 
4704                 if (zone->zone_id != GLOBAL_ZONEID &&
4705                     bldominates(curproc_label, zone_label)) {
4706                         return (B_TRUE);
4707                 } else {
4708                         return (B_FALSE);
4709                 }
4710         } else {
4711                 return (B_FALSE);
4712         }
4713 }
4714 
4715 /*
4716  * Systemcall to start the zone's halt sequence.  By the time this
4717  * function successfully returns, all user processes and kernel threads
4718  * executing in it will have exited, ZSD shutdown callbacks executed,
4719  * and the zone status set to ZONE_IS_DOWN.
4720  *
4721  * It is possible that the call will interrupt itself if the caller is the
4722  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4723  */
4724 static int
4725 zone_shutdown(zoneid_t zoneid)
4726 {
4727         int error;
4728         zone_t *zone;
4729         zone_status_t status;
4730 
4731         if (secpolicy_zone_config(CRED()) != 0)
4732                 return (set_errno(EPERM));
4733         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4734                 return (set_errno(EINVAL));
4735 
4736         mutex_enter(&zonehash_lock);
4737         /*
4738          * Look for zone under hash lock to prevent races with other
4739          * calls to zone_shutdown and zone_destroy.
4740          */
4741         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4742                 mutex_exit(&zonehash_lock);
4743                 return (set_errno(EINVAL));
4744         }
4745 
4746         /*
4747          * We have to drop zonehash_lock before calling block_mounts.
4748          * Hold the zone so we can continue to use the zone_t.
4749          */
4750         zone_hold(zone);
4751         mutex_exit(&zonehash_lock);
4752 
4753         /*
4754          * Block mounts so that VFS_MOUNT() can get an accurate view of
4755          * the zone's status with regards to ZONE_IS_SHUTTING down.
4756          *
4757          * e.g. NFS can fail the mount if it determines that the zone
4758          * has already begun the shutdown sequence.
4759          *
4760          */
4761         if (block_mounts(zone) == 0) {
4762                 zone_rele(zone);
4763                 return (set_errno(EINTR));
4764         }
4765 
4766         mutex_enter(&zonehash_lock);
4767         mutex_enter(&zone_status_lock);
4768         status = zone_status_get(zone);
4769         /*
4770          * Fail if the zone isn't fully initialized yet.
4771          */
4772         if (status < ZONE_IS_READY) {
4773                 mutex_exit(&zone_status_lock);
4774                 mutex_exit(&zonehash_lock);
4775                 resume_mounts(zone);
4776                 zone_rele(zone);
4777                 return (set_errno(EINVAL));
4778         }
4779         /*
4780          * If conditions required for zone_shutdown() to return have been met,
4781          * return success.
4782          */
4783         if (status >= ZONE_IS_DOWN) {
4784                 mutex_exit(&zone_status_lock);
4785                 mutex_exit(&zonehash_lock);
4786                 resume_mounts(zone);
4787                 zone_rele(zone);
4788                 return (0);
4789         }
4790         /*
4791          * If zone_shutdown() hasn't been called before, go through the motions.
4792          * If it has, there's nothing to do but wait for the kernel threads to
4793          * drain.
4794          */
4795         if (status < ZONE_IS_EMPTY) {
4796                 uint_t ntasks;
4797 
4798                 mutex_enter(&zone->zone_lock);
4799                 if ((ntasks = zone->zone_ntasks) != 1) {
4800                         /*
4801                          * There's still stuff running.
4802                          */
4803                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4804                 }
4805                 mutex_exit(&zone->zone_lock);
4806                 if (ntasks == 1) {
4807                         /*
4808                          * The only way to create another task is through
4809                          * zone_enter(), which will block until we drop
4810                          * zonehash_lock.  The zone is empty.
4811                          */
4812                         if (zone->zone_kthreads == NULL) {
4813                                 /*
4814                                  * Skip ahead to ZONE_IS_DOWN
4815                                  */
4816                                 zone_status_set(zone, ZONE_IS_DOWN);
4817                         } else {
4818                                 zone_status_set(zone, ZONE_IS_EMPTY);
4819                         }
4820                 }
4821         }
4822         mutex_exit(&zone_status_lock);
4823         mutex_exit(&zonehash_lock);
4824         resume_mounts(zone);
4825 
4826         if (error = zone_empty(zone)) {
4827                 zone_rele(zone);
4828                 return (set_errno(error));
4829         }
4830         /*
4831          * After the zone status goes to ZONE_IS_DOWN this zone will no
4832          * longer be notified of changes to the pools configuration, so
4833          * in order to not end up with a stale pool pointer, we point
4834          * ourselves at the default pool and remove all resource
4835          * visibility.  This is especially important as the zone_t may
4836          * languish on the deathrow for a very long time waiting for
4837          * cred's to drain out.
4838          *
4839          * This rebinding of the zone can happen multiple times
4840          * (presumably due to interrupted or parallel systemcalls)
4841          * without any adverse effects.
4842          */
4843         if (pool_lock_intr() != 0) {
4844                 zone_rele(zone);
4845                 return (set_errno(EINTR));
4846         }
4847         if (pool_state == POOL_ENABLED) {
4848                 mutex_enter(&cpu_lock);
4849                 zone_pool_set(zone, pool_default);
4850                 /*
4851                  * The zone no longer needs to be able to see any cpus.
4852                  */
4853                 zone_pset_set(zone, ZONE_PS_INVAL);
4854                 mutex_exit(&cpu_lock);
4855         }
4856         pool_unlock();
4857 
4858         /*
4859          * ZSD shutdown callbacks can be executed multiple times, hence
4860          * it is safe to not be holding any locks across this call.
4861          */
4862         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4863 
4864         mutex_enter(&zone_status_lock);
4865         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4866                 zone_status_set(zone, ZONE_IS_DOWN);
4867         mutex_exit(&zone_status_lock);
4868 
4869         /*
4870          * Wait for kernel threads to drain.
4871          */
4872         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4873                 zone_rele(zone);
4874                 return (set_errno(EINTR));
4875         }
4876 
4877         /*
4878          * Zone can be become down/destroyable even if the above wait
4879          * returns EINTR, so any code added here may never execute.
4880          * (i.e. don't add code here)
4881          */
4882 
4883         zone_rele(zone);
4884         return (0);
4885 }
4886 
4887 /*
4888  * Log the specified zone's reference counts.  The caller should not be
4889  * holding the zone's zone_lock.
4890  */
4891 static void
4892 zone_log_refcounts(zone_t *zone)
4893 {
4894         char *buffer;
4895         char *buffer_position;
4896         uint32_t buffer_size;
4897         uint32_t index;
4898         uint_t ref;
4899         uint_t cred_ref;
4900 
4901         /*
4902          * Construct a string representing the subsystem-specific reference
4903          * counts.  The counts are printed in ascending order by index into the
4904          * zone_t::zone_subsys_ref array.  The list will be surrounded by
4905          * square brackets [] and will only contain nonzero reference counts.
4906          *
4907          * The buffer will hold two square bracket characters plus ten digits,
4908          * one colon, one space, one comma, and some characters for a
4909          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
4910          * bit integers have at most ten decimal digits.)  The last
4911          * reference count's comma is replaced by the closing square
4912          * bracket and a NULL character to terminate the string.
4913          *
4914          * NOTE: We have to grab the zone's zone_lock to create a consistent
4915          * snapshot of the zone's reference counters.
4916          *
4917          * First, figure out how much space the string buffer will need.
4918          * The buffer's size is stored in buffer_size.
4919          */
4920         buffer_size = 2;                        /* for the square brackets */
4921         mutex_enter(&zone->zone_lock);
4922         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
4923         ref = zone->zone_ref;
4924         cred_ref = zone->zone_cred_ref;
4925         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
4926                 if (zone->zone_subsys_ref[index] != 0)
4927                         buffer_size += strlen(zone_ref_subsys_names[index]) +
4928                             13;
4929         if (buffer_size == 2) {
4930                 /*
4931                  * No subsystems had nonzero reference counts.  Don't bother
4932                  * with allocating a buffer; just log the general-purpose and
4933                  * credential reference counts.
4934                  */
4935                 mutex_exit(&zone->zone_lock);
4936                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4937                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
4938                     "references and %u credential references are still extant",
4939                     zone->zone_name, zone->zone_id, ref, cred_ref);
4940                 return;
4941         }
4942 
4943         /*
4944          * buffer_size contains the exact number of characters that the
4945          * buffer will need.  Allocate the buffer and fill it with nonzero
4946          * subsystem-specific reference counts.  Surround the results with
4947          * square brackets afterwards.
4948          */
4949         buffer = kmem_alloc(buffer_size, KM_SLEEP);
4950         buffer_position = &buffer[1];
4951         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
4952                 /*
4953                  * NOTE: The DDI's version of sprintf() returns a pointer to
4954                  * the modified buffer rather than the number of bytes written
4955                  * (as in snprintf(3C)).  This is unfortunate and annoying.
4956                  * Therefore, we'll use snprintf() with INT_MAX to get the
4957                  * number of bytes written.  Using INT_MAX is safe because
4958                  * the buffer is perfectly sized for the data: we'll never
4959                  * overrun the buffer.
4960                  */
4961                 if (zone->zone_subsys_ref[index] != 0)
4962                         buffer_position += snprintf(buffer_position, INT_MAX,
4963                             "%s: %u,", zone_ref_subsys_names[index],
4964                             zone->zone_subsys_ref[index]);
4965         }
4966         mutex_exit(&zone->zone_lock);
4967         buffer[0] = '[';
4968         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
4969         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
4970         buffer_position[-1] = ']';
4971 
4972         /*
4973          * Log the reference counts and free the message buffer.
4974          */
4975         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4976             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
4977             "%u credential references are still extant %s", zone->zone_name,
4978             zone->zone_id, ref, cred_ref, buffer);
4979         kmem_free(buffer, buffer_size);
4980 }
4981 
4982 /*
4983  * Systemcall entry point to finalize the zone halt process.  The caller
4984  * must have already successfully called zone_shutdown().
4985  *
4986  * Upon successful completion, the zone will have been fully destroyed:
4987  * zsched will have exited, destructor callbacks executed, and the zone
4988  * removed from the list of active zones.
4989  */
4990 static int
4991 zone_destroy(zoneid_t zoneid)
4992 {
4993         uint64_t uniqid;
4994         zone_t *zone;
4995         zone_status_t status;
4996         clock_t wait_time;
4997         boolean_t log_refcounts;
4998 
4999         if (secpolicy_zone_config(CRED()) != 0)
5000                 return (set_errno(EPERM));
5001         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5002                 return (set_errno(EINVAL));
5003 
5004         mutex_enter(&zonehash_lock);
5005         /*
5006          * Look for zone under hash lock to prevent races with other
5007          * calls to zone_destroy.
5008          */
5009         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5010                 mutex_exit(&zonehash_lock);
5011                 return (set_errno(EINVAL));
5012         }
5013 
5014         if (zone_mount_count(zone->zone_rootpath) != 0) {
5015                 mutex_exit(&zonehash_lock);
5016                 return (set_errno(EBUSY));
5017         }
5018         mutex_enter(&zone_status_lock);
5019         status = zone_status_get(zone);
5020         if (status < ZONE_IS_DOWN) {
5021                 mutex_exit(&zone_status_lock);
5022                 mutex_exit(&zonehash_lock);
5023                 return (set_errno(EBUSY));
5024         } else if (status == ZONE_IS_DOWN) {
5025                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5026         }
5027         mutex_exit(&zone_status_lock);
5028         zone_hold(zone);
5029         mutex_exit(&zonehash_lock);
5030 
5031         /*
5032          * wait for zsched to exit
5033          */
5034         zone_status_wait(zone, ZONE_IS_DEAD);
5035         zone_zsd_callbacks(zone, ZSD_DESTROY);
5036         zone->zone_netstack = NULL;
5037         uniqid = zone->zone_uniqid;
5038         zone_rele(zone);
5039         zone = NULL;    /* potentially free'd */
5040 
5041         log_refcounts = B_FALSE;
5042         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5043         mutex_enter(&zonehash_lock);
5044         for (; /* ever */; ) {
5045                 boolean_t unref;
5046                 boolean_t refs_have_been_logged;
5047 
5048                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5049                     zone->zone_uniqid != uniqid) {
5050                         /*
5051                          * The zone has gone away.  Necessary conditions
5052                          * are met, so we return success.
5053                          */
5054                         mutex_exit(&zonehash_lock);
5055                         return (0);
5056                 }
5057                 mutex_enter(&zone->zone_lock);
5058                 unref = ZONE_IS_UNREF(zone);
5059                 refs_have_been_logged = (zone->zone_flags &
5060                     ZF_REFCOUNTS_LOGGED);
5061                 mutex_exit(&zone->zone_lock);
5062                 if (unref) {
5063                         /*
5064                          * There is only one reference to the zone -- that
5065                          * added when the zone was added to the hashtables --
5066                          * and things will remain this way until we drop
5067                          * zonehash_lock... we can go ahead and cleanup the
5068                          * zone.
5069                          */
5070                         break;
5071                 }
5072 
5073                 /*
5074                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5075                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5076                  * some zone's general-purpose reference count reaches one.
5077                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5078                  * on zone_destroy_cv, then log the zone's reference counts and
5079                  * continue to wait for zone_rele() and zone_cred_rele().
5080                  */
5081                 if (!refs_have_been_logged) {
5082                         if (!log_refcounts) {
5083                                 /*
5084                                  * This thread hasn't timed out waiting on
5085                                  * zone_destroy_cv yet.  Wait wait_time clock
5086                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5087                                  * seconds) for the zone's references to clear.
5088                                  */
5089                                 ASSERT(wait_time > 0);
5090                                 wait_time = cv_reltimedwait_sig(
5091                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5092                                     TR_SEC);
5093                                 if (wait_time > 0) {
5094                                         /*
5095                                          * A thread in zone_rele() or
5096                                          * zone_cred_rele() signaled
5097                                          * zone_destroy_cv before this thread's
5098                                          * wait timed out.  The zone might have
5099                                          * only one reference left; find out!
5100                                          */
5101                                         continue;
5102                                 } else if (wait_time == 0) {
5103                                         /* The thread's process was signaled. */
5104                                         mutex_exit(&zonehash_lock);
5105                                         return (set_errno(EINTR));
5106                                 }
5107 
5108                                 /*
5109                                  * The thread timed out while waiting on
5110                                  * zone_destroy_cv.  Even though the thread
5111                                  * timed out, it has to check whether another
5112                                  * thread woke up from zone_destroy_cv and
5113                                  * destroyed the zone.
5114                                  *
5115                                  * If the zone still exists and has more than
5116                                  * one unreleased general-purpose reference,
5117                                  * then log the zone's reference counts.
5118                                  */
5119                                 log_refcounts = B_TRUE;
5120                                 continue;
5121                         }
5122 
5123                         /*
5124                          * The thread already timed out on zone_destroy_cv while
5125                          * waiting for subsystems to release the zone's last
5126                          * general-purpose references.  Log the zone's reference
5127                          * counts and wait indefinitely on zone_destroy_cv.
5128                          */
5129                         zone_log_refcounts(zone);
5130                 }
5131                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5132                         /* The thread's process was signaled. */
5133                         mutex_exit(&zonehash_lock);
5134                         return (set_errno(EINTR));
5135                 }
5136         }
5137 
5138         /*
5139          * Remove CPU cap for this zone now since we're not going to
5140          * fail below this point.
5141          */
5142         cpucaps_zone_remove(zone);
5143 
5144         /* Get rid of the zone's kstats */
5145         zone_kstat_delete(zone);
5146 
5147         /* remove the pfexecd doors */
5148         if (zone->zone_pfexecd != NULL) {
5149                 klpd_freelist(&zone->zone_pfexecd);
5150                 zone->zone_pfexecd = NULL;
5151         }
5152 
5153         /* free brand specific data */
5154         if (ZONE_IS_BRANDED(zone))
5155                 ZBROP(zone)->b_free_brand_data(zone);
5156 
5157         /* Say goodbye to brand framework. */
5158         brand_unregister_zone(zone->zone_brand);
5159 
5160         /*
5161          * It is now safe to let the zone be recreated; remove it from the
5162          * lists.  The memory will not be freed until the last cred
5163          * reference goes away.
5164          */
5165         ASSERT(zonecount > 1);       /* must be > 1; can't destroy global zone */
5166         zonecount--;
5167         /* remove from active list and hash tables */
5168         list_remove(&zone_active, zone);
5169         (void) mod_hash_destroy(zonehashbyname,
5170             (mod_hash_key_t)zone->zone_name);
5171         (void) mod_hash_destroy(zonehashbyid,
5172             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5173         if (zone->zone_flags & ZF_HASHED_LABEL)
5174                 (void) mod_hash_destroy(zonehashbylabel,
5175                     (mod_hash_key_t)zone->zone_slabel);
5176         mutex_exit(&zonehash_lock);
5177 
5178         /*
5179          * Release the root vnode; we're not using it anymore.  Nor should any
5180          * other thread that might access it exist.
5181          */
5182         if (zone->zone_rootvp != NULL) {
5183                 VN_RELE(zone->zone_rootvp);
5184                 zone->zone_rootvp = NULL;
5185         }
5186 
5187         /* add to deathrow list */
5188         mutex_enter(&zone_deathrow_lock);
5189         list_insert_tail(&zone_deathrow, zone);
5190         mutex_exit(&zone_deathrow_lock);
5191 
5192         /*
5193          * Drop last reference (which was added by zsched()), this will
5194          * free the zone unless there are outstanding cred references.
5195          */
5196         zone_rele(zone);
5197         return (0);
5198 }
5199 
5200 /*
5201  * Systemcall entry point for zone_getattr(2).
5202  */
5203 static ssize_t
5204 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5205 {
5206         size_t size;
5207         int error = 0, err;
5208         zone_t *zone;
5209         char *zonepath;
5210         char *outstr;
5211         zone_status_t zone_status;
5212         pid_t initpid;
5213         boolean_t global = (curzone == global_zone);
5214         boolean_t inzone = (curzone->zone_id == zoneid);
5215         ushort_t flags;
5216         zone_net_data_t *zbuf;
5217 
5218         mutex_enter(&zonehash_lock);
5219         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5220                 mutex_exit(&zonehash_lock);
5221                 return (set_errno(EINVAL));
5222         }
5223         zone_status = zone_status_get(zone);
5224         if (zone_status < ZONE_IS_INITIALIZED) {
5225                 mutex_exit(&zonehash_lock);
5226                 return (set_errno(EINVAL));
5227         }
5228         zone_hold(zone);
5229         mutex_exit(&zonehash_lock);
5230 
5231         /*
5232          * If not in the global zone, don't show information about other zones,
5233          * unless the system is labeled and the local zone's label dominates
5234          * the other zone.
5235          */
5236         if (!zone_list_access(zone)) {
5237                 zone_rele(zone);
5238                 return (set_errno(EINVAL));
5239         }
5240 
5241         switch (attr) {
5242         case ZONE_ATTR_ROOT:
5243                 if (global) {
5244                         /*
5245                          * Copy the path to trim the trailing "/" (except for
5246                          * the global zone).
5247                          */
5248                         if (zone != global_zone)
5249                                 size = zone->zone_rootpathlen - 1;
5250                         else
5251                                 size = zone->zone_rootpathlen;
5252                         zonepath = kmem_alloc(size, KM_SLEEP);
5253                         bcopy(zone->zone_rootpath, zonepath, size);
5254                         zonepath[size - 1] = '\0';
5255                 } else {
5256                         if (inzone || !is_system_labeled()) {
5257                                 /*
5258                                  * Caller is not in the global zone.
5259                                  * if the query is on the current zone
5260                                  * or the system is not labeled,
5261                                  * just return faked-up path for current zone.
5262                                  */
5263                                 zonepath = "/";
5264                                 size = 2;
5265                         } else {
5266                                 /*
5267                                  * Return related path for current zone.
5268                                  */
5269                                 int prefix_len = strlen(zone_prefix);
5270                                 int zname_len = strlen(zone->zone_name);
5271 
5272                                 size = prefix_len + zname_len + 1;
5273                                 zonepath = kmem_alloc(size, KM_SLEEP);
5274                                 bcopy(zone_prefix, zonepath, prefix_len);
5275                                 bcopy(zone->zone_name, zonepath +
5276                                     prefix_len, zname_len);
5277                                 zonepath[size - 1] = '\0';
5278                         }
5279                 }
5280                 if (bufsize > size)
5281                         bufsize = size;
5282                 if (buf != NULL) {
5283                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5284                         if (err != 0 && err != ENAMETOOLONG)
5285                                 error = EFAULT;
5286                 }
5287                 if (global || (is_system_labeled() && !inzone))
5288                         kmem_free(zonepath, size);
5289                 break;
5290 
5291         case ZONE_ATTR_NAME:
5292                 size = strlen(zone->zone_name) + 1;
5293                 if (bufsize > size)
5294                         bufsize = size;
5295                 if (buf != NULL) {
5296                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5297                         if (err != 0 && err != ENAMETOOLONG)
5298                                 error = EFAULT;
5299                 }
5300                 break;
5301 
5302         case ZONE_ATTR_STATUS:
5303                 /*
5304                  * Since we're not holding zonehash_lock, the zone status
5305                  * may be anything; leave it up to userland to sort it out.
5306                  */
5307                 size = sizeof (zone_status);
5308                 if (bufsize > size)
5309                         bufsize = size;
5310                 zone_status = zone_status_get(zone);
5311                 if (buf != NULL &&
5312                     copyout(&zone_status, buf, bufsize) != 0)
5313                         error = EFAULT;
5314                 break;
5315         case ZONE_ATTR_FLAGS:
5316                 size = sizeof (zone->zone_flags);
5317                 if (bufsize > size)
5318                         bufsize = size;
5319                 flags = zone->zone_flags;
5320                 if (buf != NULL &&
5321                     copyout(&flags, buf, bufsize) != 0)
5322                         error = EFAULT;
5323                 break;
5324         case ZONE_ATTR_PRIVSET:
5325                 size = sizeof (priv_set_t);
5326                 if (bufsize > size)
5327                         bufsize = size;
5328                 if (buf != NULL &&
5329                     copyout(zone->zone_privset, buf, bufsize) != 0)
5330                         error = EFAULT;
5331                 break;
5332         case ZONE_ATTR_UNIQID:
5333                 size = sizeof (zone->zone_uniqid);
5334                 if (bufsize > size)
5335                         bufsize = size;
5336                 if (buf != NULL &&
5337                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5338                         error = EFAULT;
5339                 break;
5340         case ZONE_ATTR_POOLID:
5341                 {
5342                         pool_t *pool;
5343                         poolid_t poolid;
5344 
5345                         if (pool_lock_intr() != 0) {
5346                                 error = EINTR;
5347                                 break;
5348                         }
5349                         pool = zone_pool_get(zone);
5350                         poolid = pool->pool_id;
5351                         pool_unlock();
5352                         size = sizeof (poolid);
5353                         if (bufsize > size)
5354                                 bufsize = size;
5355                         if (buf != NULL && copyout(&poolid, buf, size) != 0)
5356                                 error = EFAULT;
5357                 }
5358                 break;
5359         case ZONE_ATTR_SLBL:
5360                 size = sizeof (bslabel_t);
5361                 if (bufsize > size)
5362                         bufsize = size;
5363                 if (zone->zone_slabel == NULL)
5364                         error = EINVAL;
5365                 else if (buf != NULL &&
5366                     copyout(label2bslabel(zone->zone_slabel), buf,
5367                     bufsize) != 0)
5368                         error = EFAULT;
5369                 break;
5370         case ZONE_ATTR_INITPID:
5371                 size = sizeof (initpid);
5372                 if (bufsize > size)
5373                         bufsize = size;
5374                 initpid = zone->zone_proc_initpid;
5375                 if (initpid == -1) {
5376                         error = ESRCH;
5377                         break;
5378                 }
5379                 if (buf != NULL &&
5380                     copyout(&initpid, buf, bufsize) != 0)
5381                         error = EFAULT;
5382                 break;
5383         case ZONE_ATTR_BRAND:
5384                 size = strlen(zone->zone_brand->b_name) + 1;
5385 
5386                 if (bufsize > size)
5387                         bufsize = size;
5388                 if (buf != NULL) {
5389                         err = copyoutstr(zone->zone_brand->b_name, buf,
5390                             bufsize, NULL);
5391                         if (err != 0 && err != ENAMETOOLONG)
5392                                 error = EFAULT;
5393                 }
5394                 break;
5395         case ZONE_ATTR_INITNAME:
5396                 size = strlen(zone->zone_initname) + 1;
5397                 if (bufsize > size)
5398                         bufsize = size;
5399                 if (buf != NULL) {
5400                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5401                             NULL);
5402                         if (err != 0 && err != ENAMETOOLONG)
5403                                 error = EFAULT;
5404                 }
5405                 break;
5406         case ZONE_ATTR_BOOTARGS:
5407                 if (zone->zone_bootargs == NULL)
5408                         outstr = "";
5409                 else
5410                         outstr = zone->zone_bootargs;
5411                 size = strlen(outstr) + 1;
5412                 if (bufsize > size)
5413                         bufsize = size;
5414                 if (buf != NULL) {
5415                         err = copyoutstr(outstr, buf, bufsize, NULL);
5416                         if (err != 0 && err != ENAMETOOLONG)
5417                                 error = EFAULT;
5418                 }
5419                 break;
5420         case ZONE_ATTR_PHYS_MCAP:
5421                 size = sizeof (zone->zone_phys_mcap);
5422                 if (bufsize > size)
5423                         bufsize = size;
5424                 if (buf != NULL &&
5425                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5426                         error = EFAULT;
5427                 break;
5428         case ZONE_ATTR_SCHED_CLASS:
5429                 mutex_enter(&class_lock);
5430 
5431                 if (zone->zone_defaultcid >= loaded_classes)
5432                         outstr = "";
5433                 else
5434                         outstr = sclass[zone->zone_defaultcid].cl_name;
5435                 size = strlen(outstr) + 1;
5436                 if (bufsize > size)
5437                         bufsize = size;
5438                 if (buf != NULL) {
5439                         err = copyoutstr(outstr, buf, bufsize, NULL);
5440                         if (err != 0 && err != ENAMETOOLONG)
5441                                 error = EFAULT;
5442                 }
5443 
5444                 mutex_exit(&class_lock);
5445                 break;
5446         case ZONE_ATTR_HOSTID:
5447                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5448                     bufsize == sizeof (zone->zone_hostid)) {
5449                         size = sizeof (zone->zone_hostid);
5450                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5451                             bufsize) != 0)
5452                                 error = EFAULT;
5453                 } else {
5454                         error = EINVAL;
5455                 }
5456                 break;
5457         case ZONE_ATTR_FS_ALLOWED:
5458                 if (zone->zone_fs_allowed == NULL)
5459                         outstr = "";
5460                 else
5461                         outstr = zone->zone_fs_allowed;
5462                 size = strlen(outstr) + 1;
5463                 if (bufsize > size)
5464                         bufsize = size;
5465                 if (buf != NULL) {
5466                         err = copyoutstr(outstr, buf, bufsize, NULL);
5467                         if (err != 0 && err != ENAMETOOLONG)
5468                                 error = EFAULT;
5469                 }
5470                 break;
5471         case ZONE_ATTR_NETWORK:
5472                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5473                 if (copyin(buf, zbuf, bufsize) != 0) {
5474                         error = EFAULT;
5475                 } else {
5476                         error = zone_get_network(zoneid, zbuf);
5477                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5478                                 error = EFAULT;
5479                 }
5480                 kmem_free(zbuf, bufsize);
5481                 break;
5482         default:
5483                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5484                         size = bufsize;
5485                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5486                 } else {
5487                         error = EINVAL;
5488                 }
5489         }
5490         zone_rele(zone);
5491 
5492         if (error)
5493                 return (set_errno(error));
5494         return ((ssize_t)size);
5495 }
5496 
5497 /*
5498  * Systemcall entry point for zone_setattr(2).
5499  */
5500 /*ARGSUSED*/
5501 static int
5502 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5503 {
5504         zone_t *zone;
5505         zone_status_t zone_status;
5506         int err = -1;
5507         zone_net_data_t *zbuf;
5508 
5509         if (secpolicy_zone_config(CRED()) != 0)
5510                 return (set_errno(EPERM));
5511 
5512         /*
5513          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5514          * global zone.
5515          */
5516         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5517                 return (set_errno(EINVAL));
5518         }
5519 
5520         mutex_enter(&zonehash_lock);
5521         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5522                 mutex_exit(&zonehash_lock);
5523                 return (set_errno(EINVAL));
5524         }
5525         zone_hold(zone);
5526         mutex_exit(&zonehash_lock);
5527 
5528         /*
5529          * At present most attributes can only be set on non-running,
5530          * non-global zones.
5531          */
5532         zone_status = zone_status_get(zone);
5533         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5534                 err = EINVAL;
5535                 goto done;
5536         }
5537 
5538         switch (attr) {
5539         case ZONE_ATTR_INITNAME:
5540                 err = zone_set_initname(zone, (const char *)buf);
5541                 break;
5542         case ZONE_ATTR_INITNORESTART:
5543                 zone->zone_restart_init = B_FALSE;
5544                 err = 0;
5545                 break;
5546         case ZONE_ATTR_BOOTARGS:
5547                 err = zone_set_bootargs(zone, (const char *)buf);
5548                 break;
5549         case ZONE_ATTR_BRAND:
5550                 err = zone_set_brand(zone, (const char *)buf);
5551                 break;
5552         case ZONE_ATTR_FS_ALLOWED:
5553                 err = zone_set_fs_allowed(zone, (const char *)buf);
5554                 break;
5555         case ZONE_ATTR_PHYS_MCAP:
5556                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5557                 break;
5558         case ZONE_ATTR_SCHED_CLASS:
5559                 err = zone_set_sched_class(zone, (const char *)buf);
5560                 break;
5561         case ZONE_ATTR_HOSTID:
5562                 if (bufsize == sizeof (zone->zone_hostid)) {
5563                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5564                                 err = 0;
5565                         else
5566                                 err = EFAULT;
5567                 } else {
5568                         err = EINVAL;
5569                 }
5570                 break;
5571         case ZONE_ATTR_NETWORK:
5572                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5573                         err = EINVAL;
5574                         break;
5575                 }
5576                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5577                 if (copyin(buf, zbuf, bufsize) != 0) {
5578                         kmem_free(zbuf, bufsize);
5579                         err = EFAULT;
5580                         break;
5581                 }
5582                 err = zone_set_network(zoneid, zbuf);
5583                 kmem_free(zbuf, bufsize);
5584                 break;
5585         default:
5586                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5587                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5588                 else
5589                         err = EINVAL;
5590         }
5591 
5592 done:
5593         zone_rele(zone);
5594         ASSERT(err != -1);
5595         return (err != 0 ? set_errno(err) : 0);
5596 }
5597 
5598 /*
5599  * Return zero if the process has at least one vnode mapped in to its
5600  * address space which shouldn't be allowed to change zones.
5601  *
5602  * Also return zero if the process has any shared mappings which reserve
5603  * swap.  This is because the counting for zone.max-swap does not allow swap
5604  * reservation to be shared between zones.  zone swap reservation is counted
5605  * on zone->zone_max_swap.
5606  */
5607 static int
5608 as_can_change_zones(void)
5609 {
5610         proc_t *pp = curproc;
5611         struct seg *seg;
5612         struct as *as = pp->p_as;
5613         vnode_t *vp;
5614         int allow = 1;
5615 
5616         ASSERT(pp->p_as != &kas);
5617         AS_LOCK_ENTER(as, RW_READER);
5618         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5619 
5620                 /*
5621                  * Cannot enter zone with shared anon memory which
5622                  * reserves swap.  See comment above.
5623                  */
5624                 if (seg_can_change_zones(seg) == B_FALSE) {
5625                         allow = 0;
5626                         break;
5627                 }
5628                 /*
5629                  * if we can't get a backing vnode for this segment then skip
5630                  * it.
5631                  */
5632                 vp = NULL;
5633                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5634                         continue;
5635                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5636                         allow = 0;
5637                         break;
5638                 }
5639         }
5640         AS_LOCK_EXIT(as);
5641         return (allow);
5642 }
5643 
5644 /*
5645  * Count swap reserved by curproc's address space
5646  */
5647 static size_t
5648 as_swresv(void)
5649 {
5650         proc_t *pp = curproc;
5651         struct seg *seg;
5652         struct as *as = pp->p_as;
5653         size_t swap = 0;
5654 
5655         ASSERT(pp->p_as != &kas);
5656         ASSERT(AS_WRITE_HELD(as));
5657         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5658                 swap += seg_swresv(seg);
5659 
5660         return (swap);
5661 }
5662 
5663 /*
5664  * Systemcall entry point for zone_enter().
5665  *
5666  * The current process is injected into said zone.  In the process
5667  * it will change its project membership, privileges, rootdir/cwd,
5668  * zone-wide rctls, and pool association to match those of the zone.
5669  *
5670  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5671  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5672  * enter a zone that is "ready" or "running".
5673  */
     /*
      * zone_enter() as implemented below:
      *
      *   zoneid - id of the zone to join; values outside
      *            [MIN_USERZONEID, MAX_ZONEID] are rejected with EINVAL.
      *
      * Returns 0 on success.  On failure the result of set_errno() is
      * returned, with the error being one of EPERM, EINVAL, EINTR, EBADF,
      * EFAULT, or whatever pool_do_bind() reported.
      *
      * Every exit taken after the lwps have been held goes through the
      * "out" label so that the other lwps are always allowed to continue.
      */
5674 static int
5675 zone_enter(zoneid_t zoneid)
5676 {
5677         zone_t *zone;
5678         vnode_t *vp;
5679         proc_t *pp = curproc;
5680         contract_t *ct;
5681         cont_process_t *ctp;
5682         task_t *tk, *oldtk;
5683         kproject_t *zone_proj0;
5684         cred_t *cr, *newcr;
5685         pool_t *oldpool, *newpool;
5686         sess_t *sp;
5687         uid_t uid;
5688         zone_status_t status;
5689         int err = 0;
5690         rctl_entity_p_t e;
5691         size_t swap;
5692         kthread_id_t t;
5693 
             /*
              * Reject callers that fail the zone configuration policy check,
              * and zone ids outside the user zone id range.  Nothing has been
              * acquired yet, so these two checks return directly.
              */
5694         if (secpolicy_zone_config(CRED()) != 0)
5695                 return (set_errno(EPERM));
5696         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5697                 return (set_errno(EINVAL));
5698 
5699         /*
5700          * Stop all lwps so we don't need to hold a lock to look at
5701          * curproc->p_zone.  This needs to happen before we grab any
5702          * locks to avoid deadlock (another lwp in the process could
5703          * be waiting for the held lock).
5704          */
5705         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5706                 return (set_errno(EINTR));
5707 
5708         /*
5709          * Make sure we're not changing zones with files open or mapped in
5710          * to our address space which shouldn't be changing zones.
5711          */
5712         if (!files_can_change_zones()) {
5713                 err = EBADF;
5714                 goto out;
5715         }
5716         if (!as_can_change_zones()) {
5717                 err = EFAULT;
5718                 goto out;
5719         }
5720 
5721         mutex_enter(&zonehash_lock);
             /*
              * Only a process that is currently in the global zone may
              * enter another zone.
              */
5722         if (pp->p_zone != global_zone) {
5723                 mutex_exit(&zonehash_lock);
5724                 err = EINVAL;
5725                 goto out;
5726         }
5727 
5728         zone = zone_find_all_by_id(zoneid);
5729         if (zone == NULL) {
5730                 mutex_exit(&zonehash_lock);
5731                 err = EINVAL;
5732                 goto out;
5733         }
5734 
5735         /*
5736          * To prevent processes in a zone from holding contracts on
5737          * extrazonal resources, and to avoid process contract
5738          * memberships which span zones, contract holders and processes
5739          * which aren't the sole members of their encapsulating process
5740          * contracts are not allowed to zone_enter.
5741          */
5742         ctp = pp->p_ct_process;
5743         ct = &ctp->conp_contract;
5744         mutex_enter(&ct->ct_lock);
5745         mutex_enter(&pp->p_lock);
5746         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5747                 mutex_exit(&pp->p_lock);
5748                 mutex_exit(&ct->ct_lock);
5749                 mutex_exit(&zonehash_lock);
5750                 err = EINVAL;
5751                 goto out;
5752         }
5753 
5754         /*
5755          * Moreover, we don't allow processes whose encapsulating
5756          * process contracts have inherited extrazonal contracts.
5757          * While it would be easier to eliminate all process contracts
5758          * with inherited contracts, we need to be able to give a
5759          * restarted init (or other zone-penetrating process) its
5760          * predecessor's contracts.
5761          */
5762         if (ctp->conp_ninherited != 0) {
5763                 contract_t *next;
5764                 for (next = list_head(&ctp->conp_inherited); next;
5765                     next = list_next(&ctp->conp_inherited, next)) {
5766                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5767                                 mutex_exit(&pp->p_lock);
5768                                 mutex_exit(&ct->ct_lock);
5769                                 mutex_exit(&zonehash_lock);
5770                                 err = EINVAL;
5771                                 goto out;
5772                         }
5773                 }
5774         }
5775 
5776         mutex_exit(&pp->p_lock);
5777         mutex_exit(&ct->ct_lock);
5778 
5779         status = zone_status_get(zone);
5780         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5781                 /*
5782                  * Can't join
5783                  */
5784                 mutex_exit(&zonehash_lock);
5785                 err = EINVAL;
5786                 goto out;
5787         }
5788 
5789         /*
5790          * Make sure new priv set is within the permitted set for caller
5791          */
5792         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5793                 mutex_exit(&zonehash_lock);
5794                 err = EPERM;
5795                 goto out;
5796         }
5797         /*
5798          * We want to momentarily drop zonehash_lock while we optimistically
5799          * bind curproc to the pool it should be running in.  This is safe
5800          * since the zone can't disappear (we have a hold on it).
5801          */
5802         zone_hold(zone);
5803         mutex_exit(&zonehash_lock);
5804 
5805         /*
5806          * Grab pool_lock to keep the pools configuration from changing
5807          * and to stop ourselves from getting rebound to another pool
5808          * until we join the zone.
5809          */
5810         if (pool_lock_intr() != 0) {
5811                 zone_rele(zone);
5812                 err = EINTR;
5813                 goto out;
5814         }
5815         ASSERT(secpolicy_pool(CRED()) == 0);
5816         /*
5817          * Bind ourselves to the pool currently associated with the zone.
5818          * A bind failure is reported to the caller with pool_do_bind()'s
5819          * own error value.
5820          */
5819         oldpool = curproc->p_pool;
5820         newpool = zone_pool_get(zone);
5821         if (pool_state == POOL_ENABLED && newpool != oldpool &&
5822             (err = pool_do_bind(newpool, P_PID, P_MYID,
5823             POOL_BIND_ALL)) != 0) {
5824                 pool_unlock();
5825                 zone_rele(zone);
5826                 goto out;
5827         }
5828 
5829         /*
5830          * Grab cpu_lock now; we'll need it later when we call
5831          * task_join().
5832          */
5833         mutex_enter(&cpu_lock);
5834         mutex_enter(&zonehash_lock);
5835         /*
5836          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5837          */
5838         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5839                 /*
5840                  * Can't join anymore.  Undo the optimistic pool binding
5841                  * (if one was made) before giving up.
5842                  */
5842                 mutex_exit(&zonehash_lock);
5843                 mutex_exit(&cpu_lock);
5844                 if (pool_state == POOL_ENABLED &&
5845                     newpool != oldpool)
5846                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
5847                             POOL_BIND_ALL);
5848                 pool_unlock();
5849                 zone_rele(zone);
5850                 err = EINVAL;
5851                 goto out;
5852         }
5853 
5854         /*
5855          * a_lock must be held while transferring locked memory and swap
5856          * reservation from the global zone to the non global zone because
5857          * asynchronous faults on the processes' address space can lock
5858          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5859          * segments respectively.
5860          */
5861         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
5862         swap = as_swresv();
5863         mutex_enter(&pp->p_lock);
5864         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5865         /* verify that we do not exceed any task or lwp limits */
             /*
              * NOTE(review): no limit check is visible in the statements
              * below; the counts are simply added -- confirm whether the
              * comment above is stale.
              */
5866         mutex_enter(&zone->zone_nlwps_lock);
5867         /* add new lwps to zone and zone's proj0 */
5868         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5869         zone->zone_nlwps += pp->p_lwpcnt;
5870         /* add 1 task to zone's proj0 */
5871         zone_proj0->kpj_ntasks += 1;
5872 
             /* add the process itself to zone and zone's proj0 */
5873         zone_proj0->kpj_nprocs++;
5874         zone->zone_nprocs++;
5875         mutex_exit(&zone->zone_nlwps_lock);
5876 
             /* charge locked memory and reserved swap to the new zone */
5877         mutex_enter(&zone->zone_mem_lock);
5878         zone->zone_locked_mem += pp->p_locked_mem;
5879         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5880         zone->zone_max_swap += swap;
5881         mutex_exit(&zone->zone_mem_lock);
5882 
5883         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5884         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5885         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5886 
5887         /* remove lwps and process from proc's old zone and old project */
5888         mutex_enter(&pp->p_zone->zone_nlwps_lock);
5889         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5890         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5891         pp->p_task->tk_proj->kpj_nprocs--;
5892         pp->p_zone->zone_nprocs--;
5893         mutex_exit(&pp->p_zone->zone_nlwps_lock);
5894 
             /* and give back the old zone's locked memory and swap charges */
5895         mutex_enter(&pp->p_zone->zone_mem_lock);
5896         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5897         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5898         pp->p_zone->zone_max_swap -= swap;
5899         mutex_exit(&pp->p_zone->zone_mem_lock);
5900 
5901         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5902         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5903         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5904 
             /*
              * All accounting has been moved; now switch the process's zone
              * pointer itself.  This must stay after the "old zone"
              * adjustments above, which still go through pp->p_zone.
              */
5905         pp->p_flag |= SZONETOP;
5906         pp->p_zone = zone;
5907         mutex_exit(&pp->p_lock);
5908         AS_LOCK_EXIT(pp->p_as);
5909 
5910         /*
5911          * Joining the zone cannot fail from now on.
5912          *
5913          * This means that a lot of the following code can be commonized and
5914          * shared with zsched().
5915          */
5916 
5917         /*
5918          * If the process contract fmri was inherited, we need to
5919          * flag this so that any contract status will not leak
5920          * extra zone information, svc_fmri in this case
5921          */
5922         if (ctp->conp_svc_ctid != ct->ct_id) {
5923                 mutex_enter(&ct->ct_lock);
5924                 ctp->conp_svc_zone_enter = ct->ct_id;
5925                 mutex_exit(&ct->ct_lock);
5926         }
5927 
5928         /*
5929          * Reset the encapsulating process contract's zone.
5930          */
5931         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5932         contract_setzuniqid(ct, zone->zone_uniqid);
5933 
5934         /*
5935          * Create a new task and associate the process with the project keyed
5936          * by (projid,zoneid).
5937          *
5938          * We might as well be in project 0; the global zone's projid doesn't
5939          * make much sense in a zone anyhow.
5940          *
5941          * This also increments zone_ntasks, and returns with p_lock held.
5942          */
5943         tk = task_create(0, zone);
5944         oldtk = task_join(tk, 0);
5945         mutex_exit(&cpu_lock);
5946 
5947         /*
5948          * call RCTLOP_SET functions on this proc
5949          */
5950         e.rcep_p.zone = zone;
5951         e.rcep_t = RCENTITY_ZONE;
5952         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5953             RCD_CALLBACK);
5954         mutex_exit(&pp->p_lock);
5955 
5956         /*
5957          * We don't need to hold any of zsched's locks here; not only do we know
5958          * the process and zone aren't going away, we know its session isn't
5959          * changing either.
5960          *
5961          * By joining zsched's session here, we mimic the behavior in the
5962          * global zone of init's sid being the pid of sched.  We extend this
5963          * to all zlogin-like zone_enter()'ing processes as well.
5964          */
5965         mutex_enter(&pidlock);
5966         sp = zone->zone_zsched->p_sessp;
5967         sess_hold(zone->zone_zsched);
5968         mutex_enter(&pp->p_lock);
5969         pgexit(pp);
5970         sess_rele(pp->p_sessp, B_TRUE);
5971         pp->p_sessp = sp;
5972         pgjoin(pp, zone->zone_zsched->p_pidp);
5973 
5974         /*
5975          * If any threads are scheduled to be placed on zone wait queue they
5976          * should abandon the idea since the wait queue is changing.
5977          * We need to be holding pidlock & p_lock to do this.
5978          */
5979         if ((t = pp->p_tlist) != NULL) {
5980                 do {
5981                         thread_lock(t);
5982                         /*
5983                          * Kick this thread so that he doesn't sit
5984                          * on a wrong wait queue.
5985                          */
5986                         if (ISWAITING(t))
5987                                 setrun_locked(t);
5988 
5989                         if (t->t_schedflag & TS_ANYWAITQ)
5990                                 t->t_schedflag &= ~ TS_ANYWAITQ;
5991 
5992                         thread_unlock(t);
5993                 } while ((t = t->t_forw) != pp->p_tlist);
5994         }
5995 
5996         /*
5997          * If there is a default scheduling class for the zone and it is not
5998          * the class we are currently in, change all of the threads in the
5999          * process to the new class.  We need to be holding pidlock & p_lock
6000          * when we call parmsset so this is a good place to do it.
6001          */
6002         if (zone->zone_defaultcid > 0 &&
6003             zone->zone_defaultcid != curthread->t_cid) {
6004                 pcparms_t pcparms;
6005 
6006                 pcparms.pc_cid = zone->zone_defaultcid;
6007                 pcparms.pc_clparms[0] = 0;
6008 
6009                 /*
6010                  * If setting the class fails, we still want to enter the zone.
6011                  */
6012                 if ((t = pp->p_tlist) != NULL) {
6013                         do {
6014                                 (void) parmsset(&pcparms, t);
6015                         } while ((t = t->t_forw) != pp->p_tlist);
6016                 }
6017         }
6018 
6019         mutex_exit(&pp->p_lock);
6020         mutex_exit(&pidlock);
6021 
             /*
              * Drop zonehash_lock, which has been held since the zone's
              * status was re-checked after binding to the pool.
              */
6022         mutex_exit(&zonehash_lock);
6023         /*
6024          * We're firmly in the zone; let pools progress.
6025          */
6026         pool_unlock();
6027         task_rele(oldtk);
6028         /*
6029          * We don't need to retain a hold on the zone since we already
6030          * incremented zone_ntasks, so the zone isn't going anywhere.
6031          */
6032         zone_rele(zone);
6033 
6034         /*
6035          * Chroot: both the current directory and the root directory are
6036          * pointed at the zone's root vnode.
6037          */
6037         vp = zone->zone_rootvp;
6038         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6039         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6040 
6041         /*
6042          * Change process credentials
6043          */
6044         newcr = cralloc();
6045         mutex_enter(&pp->p_crlock);
6046         cr = pp->p_cred;
6047         crcopy_to(cr, newcr);
6048         crsetzone(newcr, zone);
6049         pp->p_cred = newcr;
6050 
6051         /*
6052          * Restrict all process privilege sets to zone limit
6053          */
6054         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6055         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6056         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6057         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6058         mutex_exit(&pp->p_crlock);
6059         crset(pp, newcr);
6060 
6061         /*
6062          * Adjust upcount to reflect zone entry.
6063          */
6064         uid = crgetruid(newcr);
6065         mutex_enter(&pidlock);
6066         upcount_dec(uid, GLOBAL_ZONEID);
6067         upcount_inc(uid, zoneid);
6068         mutex_exit(&pidlock);
6069 
6070         /*
6071          * Set up core file path and content.
6072          */
6073         set_core_defaults();
6074 
6075 out:
6076         /*
6077          * Let the other lwps continue.  This is reached both on success
6078          * (by falling through) and from every failure path above that
6079          * ran after the lwps were held.
6080          */
6079         mutex_enter(&pp->p_lock);
6080         if (curthread != pp->p_agenttp)
6081                 continuelwps(pp);
6082         mutex_exit(&pp->p_lock);
6083 
             /* err is still 0 if nothing above failed */
6084         return (err != 0 ? set_errno(err) : 0);
6085 }
6086 
6087 /*
6088  * Systemcall entry point for zone_list(2).
6089  *
6090  * Processes running in a (non-global) zone only see themselves.
6091  * On labeled systems, they see all zones whose label they dominate.
6092  */
6093 static int
6094 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6095 {
6096         zoneid_t *zoneids;
6097         zone_t *zone, *myzone;
6098         uint_t user_nzones, real_nzones;
6099         uint_t domi_nzones;
6100         int error;
6101 
6102         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6103                 return (set_errno(EFAULT));
6104 
6105         myzone = curproc->p_zone;
6106         if (myzone != global_zone) {
6107                 bslabel_t *mybslab;
6108 
6109                 if (!is_system_labeled()) {
6110                         /* just return current zone */
6111                         real_nzones = domi_nzones = 1;
6112                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6113                         zoneids[0] = myzone->zone_id;
6114                 } else {
6115                         /* return all zones that are dominated */
6116                         mutex_enter(&zonehash_lock);
6117                         real_nzones = zonecount;
6118                         domi_nzones = 0;
6119                         if (real_nzones > 0) {
6120                                 zoneids = kmem_alloc(real_nzones *
6121                                     sizeof (zoneid_t), KM_SLEEP);
6122                                 mybslab = label2bslabel(myzone->zone_slabel);
6123                                 for (zone = list_head(&zone_active);
6124                                     zone != NULL;
6125                                     zone = list_next(&zone_active, zone)) {
6126                                         if (zone->zone_id == GLOBAL_ZONEID)
6127                                                 continue;
6128                                         if (zone != myzone &&
6129                                             (zone->zone_flags & ZF_IS_SCRATCH))
6130                                                 continue;
6131                                         /*
6132                                          * Note that a label always dominates
6133                                          * itself, so myzone is always included
6134                                          * in the list.
6135                                          */
6136                                         if (bldominates(mybslab,
6137                                             label2bslabel(zone->zone_slabel))) {
6138                                                 zoneids[domi_nzones++] =
6139                                                     zone->zone_id;
6140                                         }
6141                                 }
6142                         }
6143                         mutex_exit(&zonehash_lock);
6144                 }
6145         } else {
6146                 mutex_enter(&zonehash_lock);
6147                 real_nzones = zonecount;
6148                 domi_nzones = 0;
6149                 if (real_nzones > 0) {
6150                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6151                             KM_SLEEP);
6152                         for (zone = list_head(&zone_active); zone != NULL;
6153                             zone = list_next(&zone_active, zone))
6154                                 zoneids[domi_nzones++] = zone->zone_id;
6155                         ASSERT(domi_nzones == real_nzones);
6156                 }
6157                 mutex_exit(&zonehash_lock);
6158         }
6159 
6160         /*
6161          * If user has allocated space for fewer entries than we found, then
6162          * return only up to his limit.  Either way, tell him exactly how many
6163          * we found.
6164          */
6165         if (domi_nzones < user_nzones)
6166                 user_nzones = domi_nzones;
6167         error = 0;
6168         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6169                 error = EFAULT;
6170         } else if (zoneidlist != NULL && user_nzones != 0) {
6171                 if (copyout(zoneids, zoneidlist,
6172                     user_nzones * sizeof (zoneid_t)) != 0)
6173                         error = EFAULT;
6174         }
6175 
6176         if (real_nzones > 0)
6177                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6178 
6179         if (error != 0)
6180                 return (set_errno(error));
6181         else
6182                 return (0);
6183 }
6184 
6185 /*
6186  * Systemcall entry point for zone_lookup(2).
6187  *
6188  * Non-global zones are only able to see themselves and (on labeled systems)
6189  * the zones they dominate.
6190  */
6191 static zoneid_t
6192 zone_lookup(const char *zone_name)
6193 {
6194         char *kname;
6195         zone_t *zone;
6196         zoneid_t zoneid;
6197         int err;
6198 
6199         if (zone_name == NULL) {
6200                 /* return caller's zone id */
6201                 return (getzoneid());
6202         }
6203 
6204         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6205         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6206                 kmem_free(kname, ZONENAME_MAX);
6207                 return (set_errno(err));
6208         }
6209 
6210         mutex_enter(&zonehash_lock);
6211         zone = zone_find_all_by_name(kname);
6212         kmem_free(kname, ZONENAME_MAX);
6213         /*
6214          * In a non-global zone, can only lookup global and own name.
6215          * In Trusted Extensions zone label dominance rules apply.
6216          */
6217         if (zone == NULL ||
6218             zone_status_get(zone) < ZONE_IS_READY ||
6219             !zone_list_access(zone)) {
6220                 mutex_exit(&zonehash_lock);
6221                 return (set_errno(EINVAL));
6222         } else {
6223                 zoneid = zone->zone_id;
6224                 mutex_exit(&zonehash_lock);
6225                 return (zoneid);
6226         }
6227 }
6228 
6229 static int
6230 zone_version(int *version_arg)
6231 {
6232         int version = ZONE_SYSCALL_API_VERSION;
6233 
6234         if (copyout(&version, version_arg, sizeof (int)) != 0)
6235                 return (set_errno(EFAULT));
6236         return (0);
6237 }
6238 
6239 /* ARGSUSED */
6240 long
6241 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6242 {
6243         zone_def zs;
6244         int err;
6245 
6246         switch (cmd) {
6247         case ZONE_CREATE:
6248                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6249                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6250                                 return (set_errno(EFAULT));
6251                         }
6252                 } else {
6253 #ifdef _SYSCALL32_IMPL
6254                         zone_def32 zs32;
6255 
6256                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6257                                 return (set_errno(EFAULT));
6258                         }
6259                         zs.zone_name =
6260                             (const char *)(unsigned long)zs32.zone_name;
6261                         zs.zone_root =
6262                             (const char *)(unsigned long)zs32.zone_root;
6263                         zs.zone_privs =
6264                             (const struct priv_set *)
6265                             (unsigned long)zs32.zone_privs;
6266                         zs.zone_privssz = zs32.zone_privssz;
6267                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6268                         zs.rctlbufsz = zs32.rctlbufsz;
6269                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6270                         zs.zfsbufsz = zs32.zfsbufsz;
6271                         zs.extended_error =
6272                             (int *)(unsigned long)zs32.extended_error;
6273                         zs.match = zs32.match;
6274                         zs.doi = zs32.doi;
6275                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6276                         zs.flags = zs32.flags;
6277 #else
6278                         panic("get_udatamodel() returned bogus result\n");
6279 #endif
6280                 }
6281 
6282                 return (zone_create(zs.zone_name, zs.zone_root,
6283                     zs.zone_privs, zs.zone_privssz,
6284                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6285                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6286                     zs.extended_error, zs.match, zs.doi,
6287                     zs.label, zs.flags));
6288         case ZONE_BOOT:
6289                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6290         case ZONE_DESTROY:
6291                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6292         case ZONE_GETATTR:
6293                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6294                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6295         case ZONE_SETATTR:
6296                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6297                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6298         case ZONE_ENTER:
6299                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6300         case ZONE_LIST:
6301                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6302         case ZONE_SHUTDOWN:
6303                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6304         case ZONE_LOOKUP:
6305                 return (zone_lookup((const char *)arg1));
6306         case ZONE_VERSION:
6307                 return (zone_version((int *)arg1));
6308         case ZONE_ADD_DATALINK:
6309                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6310                     (datalink_id_t)(uintptr_t)arg2));
6311         case ZONE_DEL_DATALINK:
6312                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6313                     (datalink_id_t)(uintptr_t)arg2));
6314         case ZONE_CHECK_DATALINK: {
6315                 zoneid_t        zoneid;
6316                 boolean_t       need_copyout;
6317 
6318                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6319                         return (EFAULT);
6320                 need_copyout = (zoneid == ALL_ZONES);
6321                 err = zone_check_datalink(&zoneid,
6322                     (datalink_id_t)(uintptr_t)arg2);
6323                 if (err == 0 && need_copyout) {
6324                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6325                                 err = EFAULT;
6326                 }
6327                 return (err == 0 ? 0 : set_errno(err));
6328         }
6329         case ZONE_LIST_DATALINK:
6330                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6331                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6332         default:
6333                 return (set_errno(EINVAL));
6334         }
6335 }
6336 
     /*
      * Argument bundle consumed by zone_ki_call_zoneadmd(), which copies
      * both fields out, frees the structure itself, and later calls
      * zone_rele() on the zone (so the zone is presumably passed in with a
      * hold -- confirm against the code that allocates the structure).
      */
6337 struct zarg {
6338         zone_t *zone;           /* zone the request is about */
6339         zone_cmd_arg_t arg;     /* command argument sent through the door */
6340 };
6341 
6342 static int
6343 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6344 {
6345         char *buf;
6346         size_t buflen;
6347         int error;
6348 
6349         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6350         buf = kmem_alloc(buflen, KM_SLEEP);
6351         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6352         error = door_ki_open(buf, doorp);
6353         kmem_free(buf, buflen);
6354         return (error);
6355 }
6356 
6357 static void
6358 zone_release_door(door_handle_t *doorp)
6359 {
6360         door_ki_rele(*doorp);
6361         *doorp = NULL;
6362 }
6363 
6364 static void
6365 zone_ki_call_zoneadmd(struct zarg *zargp)
6366 {
6367         door_handle_t door = NULL;
6368         door_arg_t darg, save_arg;
6369         char *zone_name;
6370         size_t zone_namelen;
6371         zoneid_t zoneid;
6372         zone_t *zone;
6373         zone_cmd_arg_t arg;
6374         uint64_t uniqid;
6375         size_t size;
6376         int error;
6377         int retry;
6378 
6379         zone = zargp->zone;
6380         arg = zargp->arg;
6381         kmem_free(zargp, sizeof (*zargp));
6382 
6383         zone_namelen = strlen(zone->zone_name) + 1;
6384         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6385         bcopy(zone->zone_name, zone_name, zone_namelen);
6386         zoneid = zone->zone_id;
6387         uniqid = zone->zone_uniqid;
6388         /*
6389          * zoneadmd may be down, but at least we can empty out the zone.
6390          * We can ignore the return value of zone_empty() since we're called
6391          * from a kernel thread and know we won't be delivered any signals.
6392          */
6393         ASSERT(curproc == &p0);
6394         (void) zone_empty(zone);
6395         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6396         zone_rele(zone);
6397 
6398         size = sizeof (arg);
6399         darg.rbuf = (char *)&arg;
6400         darg.data_ptr = (char *)&arg;
6401         darg.rsize = size;
6402         darg.data_size = size;
6403         darg.desc_ptr = NULL;
6404         darg.desc_num = 0;
6405 
6406         save_arg = darg;
6407         /*
6408          * Since we're not holding a reference to the zone, any number of
6409          * things can go wrong, including the zone disappearing before we get a
6410          * chance to talk to zoneadmd.
6411          */
6412         for (retry = 0; /* forever */; retry++) {
6413                 if (door == NULL &&
6414                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6415                         goto next;
6416                 }
6417                 ASSERT(door != NULL);
6418 
6419                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6420                     SIZE_MAX, 0)) == 0) {
6421                         break;
6422                 }
6423                 switch (error) {
6424                 case EINTR:
6425                         /* FALLTHROUGH */
6426                 case EAGAIN:    /* process may be forking */
6427                         /*
6428                          * Back off for a bit
6429                          */
6430                         break;
6431                 case EBADF:
6432                         zone_release_door(&door);
6433                         if (zone_lookup_door(zone_name, &door) != 0) {
6434                                 /*
6435                                  * zoneadmd may be dead, but it may come back to
6436                                  * life later.
6437                                  */
6438                                 break;
6439                         }
6440                         break;
6441                 default:
6442                         cmn_err(CE_WARN,
6443                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6444                             error);
6445                         goto out;
6446                 }
6447 next:
6448                 /*
6449                  * If this isn't the same zone_t that we originally had in mind,
6450                  * then this is the same as if two kadmin requests come in at
6451                  * the same time: the first one wins.  This means we lose, so we
6452                  * bail.
6453                  */
6454                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
6455                         /*
6456                          * Problem is solved.
6457                          */
6458                         break;
6459                 }
6460                 if (zone->zone_uniqid != uniqid) {
6461                         /*
6462                          * zoneid recycled
6463                          */
6464                         zone_rele(zone);
6465                         break;
6466                 }
6467                 /*
6468                  * We could zone_status_timedwait(), but there doesn't seem to
6469                  * be much point in doing that (plus, it would mean that
6470                  * zone_free() isn't called until this thread exits).
6471                  */
6472                 zone_rele(zone);
6473                 delay(hz);
6474                 darg = save_arg;
6475         }
6476 out:
6477         if (door != NULL) {
6478                 zone_release_door(&door);
6479         }
6480         kmem_free(zone_name, zone_namelen);
6481         thread_exit();
6482 }
6483 
6484 /*
6485  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6486  * kadmin().  The caller is a process in the zone.
6487  *
6488  * In order to shutdown the zone, we will hand off control to zoneadmd
6489  * (running in the global zone) via a door.  We do a half-hearted job at
6490  * killing all processes in the zone, create a kernel thread to contact
6491  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6492  * a form of generation number used to let zoneadmd (as well as
6493  * zone_destroy()) know exactly which zone they're talking about.
6494  */
6495 int
6496 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6497 {
6498         struct zarg *zargp;
6499         zone_cmd_t zcmd;
6500         zone_t *zone;
6501 
6502         zone = curproc->p_zone;
6503         ASSERT(getzoneid() != GLOBAL_ZONEID);
6504 
6505         switch (cmd) {
6506         case A_SHUTDOWN:
6507                 switch (fcn) {
6508                 case AD_HALT:
6509                 case AD_POWEROFF:
6510                         zcmd = Z_HALT;
6511                         break;
6512                 case AD_BOOT:
6513                         zcmd = Z_REBOOT;
6514                         break;
6515                 case AD_IBOOT:
6516                 case AD_SBOOT:
6517                 case AD_SIBOOT:
6518                 case AD_NOSYNC:
6519                         return (ENOTSUP);
6520                 default:
6521                         return (EINVAL);
6522                 }
6523                 break;
6524         case A_REBOOT:
6525                 zcmd = Z_REBOOT;
6526                 break;
6527         case A_FTRACE:
6528         case A_REMOUNT:
6529         case A_FREEZE:
6530         case A_DUMP:
6531         case A_CONFIG:
6532                 return (ENOTSUP);
6533         default:
6534                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6535                 return (EINVAL);
6536         }
6537 
6538         if (secpolicy_zone_admin(credp, B_FALSE))
6539                 return (EPERM);
6540         mutex_enter(&zone_status_lock);
6541 
6542         /*
6543          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6544          * is in the zone.
6545          */
6546         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6547         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6548                 /*
6549                  * This zone is already on its way down.
6550                  */
6551                 mutex_exit(&zone_status_lock);
6552                 return (0);
6553         }
6554         /*
6555          * Prevent future zone_enter()s
6556          */
6557         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6558         mutex_exit(&zone_status_lock);
6559 
6560         /*
6561          * Kill everyone now and call zoneadmd later.
6562          * zone_ki_call_zoneadmd() will do a more thorough job of this
6563          * later.
6564          */
6565         killall(zone->zone_id);
6566         /*
6567          * Now, create the thread to contact zoneadmd and do the rest of the
6568          * work.  This thread can't be created in our zone otherwise
6569          * zone_destroy() would deadlock.
6570          */
6571         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6572         zargp->arg.cmd = zcmd;
6573         zargp->arg.uniqid = zone->zone_uniqid;
6574         zargp->zone = zone;
6575         (void) strcpy(zargp->arg.locale, "C");
6576         /* mdep was already copied in for us by uadmin */
6577         if (mdep != NULL)
6578                 (void) strlcpy(zargp->arg.bootbuf, mdep,
6579                     sizeof (zargp->arg.bootbuf));
6580         zone_hold(zone);
6581 
6582         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6583             TS_RUN, minclsyspri);
6584         exit(CLD_EXITED, 0);
6585 
6586         return (EINVAL);
6587 }
6588 
6589 /*
6590  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6591  * status to ZONE_IS_SHUTTING_DOWN.
6592  *
6593  * This function also shuts down all running zones to ensure that they won't
6594  * fork new processes.
6595  */
6596 void
6597 zone_shutdown_global(void)
6598 {
6599         zone_t *current_zonep;
6600 
6601         ASSERT(INGLOBALZONE(curproc));
6602         mutex_enter(&zonehash_lock);
6603         mutex_enter(&zone_status_lock);
6604 
6605         /* Modify the global zone's status first. */
6606         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6607         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6608 
6609         /*
6610          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6611          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6612          * could cause assertions to fail (e.g., assertions about a zone's
6613          * state during initialization, readying, or booting) or produce races.
6614          * We'll let threads continue to initialize and ready new zones: they'll
6615          * fail to boot the new zones when they see that the global zone is
6616          * shutting down.
6617          */
6618         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6619             current_zonep = list_next(&zone_active, current_zonep)) {
6620                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6621                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6622         }
6623         mutex_exit(&zone_status_lock);
6624         mutex_exit(&zonehash_lock);
6625 }
6626 
6627 /*
6628  * Returns true if the named dataset is visible in the current zone.
6629  * The 'write' parameter is set to 1 if the dataset is also writable.
6630  */
6631 int
6632 zone_dataset_visible(const char *dataset, int *write)
6633 {
6634         static int zfstype = -1;
6635         zone_dataset_t *zd;
6636         size_t len;
6637         zone_t *zone = curproc->p_zone;
6638         const char *name = NULL;
6639         vfs_t *vfsp = NULL;
6640 
6641         if (dataset[0] == '\0')
6642                 return (0);
6643 
6644         /*
6645          * Walk the list once, looking for datasets which match exactly, or
6646          * specify a dataset underneath an exported dataset.  If found, return
6647          * true and note that it is writable.
6648          */
6649         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6650             zd = list_next(&zone->zone_datasets, zd)) {
6651 
6652                 len = strlen(zd->zd_dataset);
6653                 if (strlen(dataset) >= len &&
6654                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6655                     (dataset[len] == '\0' || dataset[len] == '/' ||
6656                     dataset[len] == '@')) {
6657                         if (write)
6658                                 *write = 1;
6659                         return (1);
6660                 }
6661         }
6662 
6663         /*
6664          * Walk the list a second time, searching for datasets which are parents
6665          * of exported datasets.  These should be visible, but read-only.
6666          *
6667          * Note that we also have to support forms such as 'pool/dataset/', with
6668          * a trailing slash.
6669          */
6670         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6671             zd = list_next(&zone->zone_datasets, zd)) {
6672 
6673                 len = strlen(dataset);
6674                 if (dataset[len - 1] == '/')
6675                         len--;  /* Ignore trailing slash */
6676                 if (len < strlen(zd->zd_dataset) &&
6677                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6678                     zd->zd_dataset[len] == '/') {
6679                         if (write)
6680                                 *write = 0;
6681                         return (1);
6682                 }
6683         }
6684 
6685         /*
6686          * We reach here if the given dataset is not found in the zone_dataset
6687          * list. Check if this dataset was added as a filesystem (ie. "add fs")
6688          * instead of delegation. For this we search for the dataset in the
6689          * zone_vfslist of this zone. If found, return true and note that it is
6690          * not writable.
6691          */
6692 
6693         /*
6694          * Initialize zfstype if it is not initialized yet.
6695          */
6696         if (zfstype == -1) {
6697                 struct vfssw *vswp = vfs_getvfssw("zfs");
6698                 zfstype = vswp - vfssw;
6699                 vfs_unrefvfssw(vswp);
6700         }
6701 
6702         vfs_list_read_lock();
6703         vfsp = zone->zone_vfslist;
6704         do {
6705                 ASSERT(vfsp);
6706                 if (vfsp->vfs_fstype == zfstype) {
6707                         name = refstr_value(vfsp->vfs_resource);
6708 
6709                         /*
6710                          * Check if we have an exact match.
6711                          */
6712                         if (strcmp(dataset, name) == 0) {
6713                                 vfs_list_unlock();
6714                                 if (write)
6715                                         *write = 0;
6716                                 return (1);
6717                         }
6718                         /*
6719                          * We need to check if we are looking for parents of
6720                          * a dataset. These should be visible, but read-only.
6721                          */
6722                         len = strlen(dataset);
6723                         if (dataset[len - 1] == '/')
6724                                 len--;
6725 
6726                         if (len < strlen(name) &&
6727                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6728                                 vfs_list_unlock();
6729                                 if (write)
6730                                         *write = 0;
6731                                 return (1);
6732                         }
6733                 }
6734                 vfsp = vfsp->vfs_zone_next;
6735         } while (vfsp != zone->zone_vfslist);
6736 
6737         vfs_list_unlock();
6738         return (0);
6739 }
6740 
6741 /*
6742  * zone_find_by_any_path() -
6743  *
6744  * kernel-private routine similar to zone_find_by_path(), but which
6745  * effectively compares against zone paths rather than zonerootpath
6746  * (i.e., the last component of zonerootpaths, which should be "root/",
6747  * are not compared.)  This is done in order to accurately identify all
6748  * paths, whether zone-visible or not, including those which are parallel
6749  * to /root/, such as /dev/, /home/, etc...
6750  *
6751  * If the specified path does not fall under any zone path then global
6752  * zone is returned.
6753  *
6754  * The treat_abs parameter indicates whether the path should be treated as
6755  * an absolute path although it does not begin with "/".  (This supports
6756  * nfs mount syntax such as host:any/path.)
6757  *
6758  * The caller is responsible for zone_rele of the returned zone.
6759  */
6760 zone_t *
6761 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6762 {
6763         zone_t *zone;
6764         int path_offset = 0;
6765 
6766         if (path == NULL) {
6767                 zone_hold(global_zone);
6768                 return (global_zone);
6769         }
6770 
6771         if (*path != '/') {
6772                 ASSERT(treat_abs);
6773                 path_offset = 1;
6774         }
6775 
6776         mutex_enter(&zonehash_lock);
6777         for (zone = list_head(&zone_active); zone != NULL;
6778             zone = list_next(&zone_active, zone)) {
6779                 char    *c;
6780                 size_t  pathlen;
6781                 char *rootpath_start;
6782 
6783                 if (zone == global_zone)        /* skip global zone */
6784                         continue;
6785 
6786                 /* scan backwards to find start of last component */
6787                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6788                 do {
6789                         c--;
6790                 } while (*c != '/');
6791 
6792                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
6793                 rootpath_start = (zone->zone_rootpath + path_offset);
6794                 if (strncmp(path, rootpath_start, pathlen) == 0)
6795                         break;
6796         }
6797         if (zone == NULL)
6798                 zone = global_zone;
6799         zone_hold(zone);
6800         mutex_exit(&zonehash_lock);
6801         return (zone);
6802 }
6803 
6804 /*
6805  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6806  * zone_dl_t pointer if found, and NULL otherwise.
6807  */
6808 static zone_dl_t *
6809 zone_find_dl(zone_t *zone, datalink_id_t linkid)
6810 {
6811         zone_dl_t *zdl;
6812 
6813         ASSERT(mutex_owned(&zone->zone_lock));
6814         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6815             zdl = list_next(&zone->zone_dl_list, zdl)) {
6816                 if (zdl->zdl_id == linkid)
6817                         break;
6818         }
6819         return (zdl);
6820 }
6821 
6822 static boolean_t
6823 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6824 {
6825         boolean_t exists;
6826 
6827         mutex_enter(&zone->zone_lock);
6828         exists = (zone_find_dl(zone, linkid) != NULL);
6829         mutex_exit(&zone->zone_lock);
6830         return (exists);
6831 }
6832 
6833 /*
6834  * Add an data link name for the zone.
6835  */
6836 static int
6837 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6838 {
6839         zone_dl_t *zdl;
6840         zone_t *zone;
6841         zone_t *thiszone;
6842 
6843         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6844                 return (set_errno(ENXIO));
6845 
6846         /* Verify that the datalink ID doesn't already belong to a zone. */
6847         mutex_enter(&zonehash_lock);
6848         for (zone = list_head(&zone_active); zone != NULL;
6849             zone = list_next(&zone_active, zone)) {
6850                 if (zone_dl_exists(zone, linkid)) {
6851                         mutex_exit(&zonehash_lock);
6852                         zone_rele(thiszone);
6853                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6854                 }
6855         }
6856 
6857         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6858         zdl->zdl_id = linkid;
6859         zdl->zdl_net = NULL;
6860         mutex_enter(&thiszone->zone_lock);
6861         list_insert_head(&thiszone->zone_dl_list, zdl);
6862         mutex_exit(&thiszone->zone_lock);
6863         mutex_exit(&zonehash_lock);
6864         zone_rele(thiszone);
6865         return (0);
6866 }
6867 
6868 static int
6869 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6870 {
6871         zone_dl_t *zdl;
6872         zone_t *zone;
6873         int err = 0;
6874 
6875         if ((zone = zone_find_by_id(zoneid)) == NULL)
6876                 return (set_errno(EINVAL));
6877 
6878         mutex_enter(&zone->zone_lock);
6879         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6880                 err = ENXIO;
6881         } else {
6882                 list_remove(&zone->zone_dl_list, zdl);
6883                 if (zdl->zdl_net != NULL)
6884                         nvlist_free(zdl->zdl_net);
6885                 kmem_free(zdl, sizeof (zone_dl_t));
6886         }
6887         mutex_exit(&zone->zone_lock);
6888         zone_rele(zone);
6889         return (err == 0 ? 0 : set_errno(err));
6890 }
6891 
6892 /*
6893  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
6894  * the linkid.  Otherwise we just check if the specified zoneidp has been
6895  * assigned the supplied linkid.
6896  */
6897 int
6898 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6899 {
6900         zone_t *zone;
6901         int err = ENXIO;
6902 
6903         if (*zoneidp != ALL_ZONES) {
6904                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6905                         if (zone_dl_exists(zone, linkid))
6906                                 err = 0;
6907                         zone_rele(zone);
6908                 }
6909                 return (err);
6910         }
6911 
6912         mutex_enter(&zonehash_lock);
6913         for (zone = list_head(&zone_active); zone != NULL;
6914             zone = list_next(&zone_active, zone)) {
6915                 if (zone_dl_exists(zone, linkid)) {
6916                         *zoneidp = zone->zone_id;
6917                         err = 0;
6918                         break;
6919                 }
6920         }
6921         mutex_exit(&zonehash_lock);
6922         return (err);
6923 }
6924 
6925 /*
6926  * Get the list of datalink IDs assigned to a zone.
6927  *
6928  * On input, *nump is the number of datalink IDs that can fit in the supplied
6929  * idarray.  Upon return, *nump is either set to the number of datalink IDs
6930  * that were placed in the array if the array was large enough, or to the
6931  * number of datalink IDs that the function needs to place in the array if the
6932  * array is too small.
6933  */
6934 static int
6935 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6936 {
6937         uint_t num, dlcount;
6938         zone_t *zone;
6939         zone_dl_t *zdl;
6940         datalink_id_t *idptr = idarray;
6941 
6942         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6943                 return (set_errno(EFAULT));
6944         if ((zone = zone_find_by_id(zoneid)) == NULL)
6945                 return (set_errno(ENXIO));
6946 
6947         num = 0;
6948         mutex_enter(&zone->zone_lock);
6949         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6950             zdl = list_next(&zone->zone_dl_list, zdl)) {
6951                 /*
6952                  * If the list is bigger than what the caller supplied, just
6953                  * count, don't do copyout.
6954                  */
6955                 if (++num > dlcount)
6956                         continue;
6957                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
6958                         mutex_exit(&zone->zone_lock);
6959                         zone_rele(zone);
6960                         return (set_errno(EFAULT));
6961                 }
6962                 idptr++;
6963         }
6964         mutex_exit(&zone->zone_lock);
6965         zone_rele(zone);
6966 
6967         /* Increased or decreased, caller should be notified. */
6968         if (num != dlcount) {
6969                 if (copyout(&num, nump, sizeof (num)) != 0)
6970                         return (set_errno(EFAULT));
6971         }
6972         return (0);
6973 }
6974 
6975 /*
6976  * Public interface for looking up a zone by zoneid. It's a customized version
6977  * for netstack_zone_create(). It can only be called from the zsd create
6978  * callbacks, since it doesn't have reference on the zone structure hence if
6979  * it is called elsewhere the zone could disappear after the zonehash_lock
6980  * is dropped.
6981  *
6982  * Furthermore it
6983  * 1. Doesn't check the status of the zone.
6984  * 2. It will be called even before zone_init is called, in that case the
6985  *    address of zone0 is returned directly, and netstack_zone_create()
6986  *    will only assign a value to zone0.zone_netstack, won't break anything.
6987  * 3. Returns without the zone being held.
6988  */
6989 zone_t *
6990 zone_find_by_id_nolock(zoneid_t zoneid)
6991 {
6992         zone_t *zone;
6993 
6994         mutex_enter(&zonehash_lock);
6995         if (zonehashbyid == NULL)
6996                 zone = &zone0;
6997         else
6998                 zone = zone_find_all_by_id(zoneid);
6999         mutex_exit(&zonehash_lock);
7000         return (zone);
7001 }
7002 
7003 /*
7004  * Walk the datalinks for a given zone
7005  */
7006 int
7007 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7008     void *data)
7009 {
7010         zone_t          *zone;
7011         zone_dl_t       *zdl;
7012         datalink_id_t   *idarray;
7013         uint_t          idcount = 0;
7014         int             i, ret = 0;
7015 
7016         if ((zone = zone_find_by_id(zoneid)) == NULL)
7017                 return (ENOENT);
7018 
7019         /*
7020          * We first build an array of linkid's so that we can walk these and
7021          * execute the callback with the zone_lock dropped.
7022          */
7023         mutex_enter(&zone->zone_lock);
7024         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7025             zdl = list_next(&zone->zone_dl_list, zdl)) {
7026                 idcount++;
7027         }
7028 
7029         if (idcount == 0) {
7030                 mutex_exit(&zone->zone_lock);
7031                 zone_rele(zone);
7032                 return (0);
7033         }
7034 
7035         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7036         if (idarray == NULL) {
7037                 mutex_exit(&zone->zone_lock);
7038                 zone_rele(zone);
7039                 return (ENOMEM);
7040         }
7041 
7042         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7043             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7044                 idarray[i] = zdl->zdl_id;
7045         }
7046 
7047         mutex_exit(&zone->zone_lock);
7048 
7049         for (i = 0; i < idcount && ret == 0; i++) {
7050                 if ((ret = (*cb)(idarray[i], data)) != 0)
7051                         break;
7052         }
7053 
7054         zone_rele(zone);
7055         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7056         return (ret);
7057 }
7058 
7059 static char *
7060 zone_net_type2name(int type)
7061 {
7062         switch (type) {
7063         case ZONE_NETWORK_ADDRESS:
7064                 return (ZONE_NET_ADDRNAME);
7065         case ZONE_NETWORK_DEFROUTER:
7066                 return (ZONE_NET_RTRNAME);
7067         default:
7068                 return (NULL);
7069         }
7070 }
7071 
7072 static int
7073 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7074 {
7075         zone_t *zone;
7076         zone_dl_t *zdl;
7077         nvlist_t *nvl;
7078         int err = 0;
7079         uint8_t *new = NULL;
7080         char *nvname;
7081         int bufsize;
7082         datalink_id_t linkid = znbuf->zn_linkid;
7083 
7084         if (secpolicy_zone_config(CRED()) != 0)
7085                 return (set_errno(EPERM));
7086 
7087         if (zoneid == GLOBAL_ZONEID)
7088                 return (set_errno(EINVAL));
7089 
7090         nvname = zone_net_type2name(znbuf->zn_type);
7091         bufsize = znbuf->zn_len;
7092         new = znbuf->zn_val;
7093         if (nvname == NULL)
7094                 return (set_errno(EINVAL));
7095 
7096         if ((zone = zone_find_by_id(zoneid)) == NULL) {
7097                 return (set_errno(EINVAL));
7098         }
7099 
7100         mutex_enter(&zone->zone_lock);
7101         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7102                 err = ENXIO;
7103                 goto done;
7104         }
7105         if ((nvl = zdl->zdl_net) == NULL) {
7106                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7107                         err = ENOMEM;
7108                         goto done;
7109                 } else {
7110                         zdl->zdl_net = nvl;
7111                 }
7112         }
7113         if (nvlist_exists(nvl, nvname)) {
7114                 err = EINVAL;
7115                 goto done;
7116         }
7117         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7118         ASSERT(err == 0);
7119 done:
7120         mutex_exit(&zone->zone_lock);
7121         zone_rele(zone);
7122         if (err != 0)
7123                 return (set_errno(err));
7124         else
7125                 return (0);
7126 }
7127 
7128 static int
7129 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7130 {
7131         zone_t *zone;
7132         zone_dl_t *zdl;
7133         nvlist_t *nvl;
7134         uint8_t *ptr;
7135         uint_t psize;
7136         int err = 0;
7137         char *nvname;
7138         int bufsize;
7139         void *buf;
7140         datalink_id_t linkid = znbuf->zn_linkid;
7141 
7142         if (zoneid == GLOBAL_ZONEID)
7143                 return (set_errno(EINVAL));
7144 
7145         nvname = zone_net_type2name(znbuf->zn_type);
7146         bufsize = znbuf->zn_len;
7147         buf = znbuf->zn_val;
7148 
7149         if (nvname == NULL)
7150                 return (set_errno(EINVAL));
7151         if ((zone = zone_find_by_id(zoneid)) == NULL)
7152                 return (set_errno(EINVAL));
7153 
7154         mutex_enter(&zone->zone_lock);
7155         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7156                 err = ENXIO;
7157                 goto done;
7158         }
7159         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7160                 err = ENOENT;
7161                 goto done;
7162         }
7163         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7164         ASSERT(err == 0);
7165 
7166         if (psize > bufsize) {
7167                 err = ENOBUFS;
7168                 goto done;
7169         }
7170         znbuf->zn_len = psize;
7171         bcopy(ptr, buf, psize);
7172 done:
7173         mutex_exit(&zone->zone_lock);
7174         zone_rele(zone);
7175         if (err != 0)
7176                 return (set_errno(err));
7177         else
7178                 return (0);
7179 }