illumos-gate Wdiff usr/src/uts/common/os/zone.c

Print this page

Add boot_hrtime to global and zone kstats.

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/zone.c
          +++ new/usr/src/uts/common/os/zone.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.

↓ open down ↓

14 lines elided

↑ open up ↑

  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2013, Joyent Inc. All rights reserved.
       25 + * Copyright 2016 Garrett D'Amore
  25   26   */
  26   27  
  27   28  /*
  28   29   * Zones
  29   30   *
  30   31   *   A zone is a named collection of processes, namespace constraints,
  31   32   *   and other system resources which comprise a secure and manageable
  32   33   *   application containment facility.
  33   34   *
  34   35   *   Zones (represented by the reference counted zone_t) are tracked in

  35   36   *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  36   37   *   (zoneid_t) are used to track zone association.  Zone IDs are
  37   38   *   dynamically generated when the zone is created; if a persistent
  38   39   *   identifier is needed (core files, accounting logs, audit trail,
  39   40   *   etc.), the zone name should be used.
  40   41   *
  41   42   *
  42   43   *   Global Zone:
  43   44   *
  44   45   *   The global zone (zoneid 0) is automatically associated with all
  45   46   *   system resources that have not been bound to a user-created zone.
  46   47   *   This means that even systems where zones are not in active use
  47   48   *   have a global zone, and all processes, mounts, etc. are
  48   49   *   associated with that zone.  The global zone is generally
  49   50   *   unconstrained in terms of privileges and access, though the usual
  50   51   *   credential and privilege based restrictions apply.
  51   52   *
  52   53   *
  53   54   *   Zone States:
  54   55   *
  55   56   *   The states a zone may be in, and the transitions between them, are as
  56   57   *   follows:
  57   58   *
  58   59   *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  59   60   *   initialized zone is added to the list of active zones on the system but
  60   61   *   isn't accessible.
  61   62   *
  62   63   *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  63   64   *   not yet completed. Not possible to enter the zone, but attributes can
  64   65   *   be retrieved.
  65   66   *
  66   67   *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  67   68   *   ready.  The zone is made visible after the ZSD constructor callbacks are
  68   69   *   executed.  A zone remains in this state until it transitions into
  69   70   *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  70   71   *
  71   72   *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  72   73   *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  73   74   *   state.
  74   75   *
  75   76   *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  76   77   *   successfully started init.   A zone remains in this state until
  77   78   *   zone_shutdown() is called.
  78   79   *
  79   80   *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  80   81   *   killing all processes running in the zone. The zone remains
  81   82   *   in this state until there are no more user processes running in the zone.
  82   83   *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  83   84   *   Since zone_shutdown() is restartable, it may be called successfully
  84   85   *   multiple times for the same zone_t.  Setting of the zone's state to
  85   86   *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  86   87   *   the zone's status without worrying about it being a moving target.
  87   88   *
  88   89   *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  89   90   *   are no more user processes in the zone.  The zone remains in this
  90   91   *   state until there are no more kernel threads associated with the
  91   92   *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  92   93   *   fail.
  93   94   *
  94   95   *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  95   96   *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  96   97   *   join the zone or create kernel threads therein.
  97   98   *
  98   99   *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
  99  100   *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 100  101   *   return NULL from now on.
 101  102   *
 102  103   *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 103  104   *   processes or threads doing work on behalf of the zone.  The zone is
 104  105   *   removed from the list of active zones.  zone_destroy() returns, and
 105  106   *   the zone can be recreated.
 106  107   *
 107  108   *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 108  109   *   callbacks are executed, and all memory associated with the zone is
 109  110   *   freed.
 110  111   *
 111  112   *   Threads can wait for the zone to enter a requested state by using
 112  113   *   zone_status_wait() or zone_status_timedwait() with the desired
 113  114   *   state passed in as an argument.  Zone state transitions are
 114  115   *   uni-directional; it is not possible to move back to an earlier state.
 115  116   *
 116  117   *
 117  118   *   Zone-Specific Data:
 118  119   *
 119  120   *   Subsystems needing to maintain zone-specific data can store that
 120  121   *   data using the ZSD mechanism.  This provides a zone-specific data
 121  122   *   store, similar to thread-specific data (see pthread_getspecific(3C)
 122  123   *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 123  124   *   to register callbacks to be invoked when a zone is created, shut
 124  125   *   down, or destroyed.  This can be used to initialize zone-specific
 125  126   *   data for new zones and to clean up when zones go away.
 126  127   *
 127  128   *
 128  129   *   Data Structures:
 129  130   *
 130  131   *   The per-zone structure (zone_t) is reference counted, and freed
 131  132   *   when all references are released.  zone_hold and zone_rele can be
 132  133   *   used to adjust the reference count.  In addition, reference counts
 133  134   *   associated with the cred_t structure are tracked separately using
 134  135   *   zone_cred_hold and zone_cred_rele.
 135  136   *
 136  137   *   Pointers to active zone_t's are stored in two hash tables; one
 137  138   *   for searching by id, the other for searching by name.  Lookups
 138  139   *   can be performed on either basis, using zone_find_by_id and
 139  140   *   zone_find_by_name.  Both return zone_t pointers with the zone
 140  141   *   held, so zone_rele should be called when the pointer is no longer
 141  142   *   needed.  Zones can also be searched by path; zone_find_by_path
 142  143   *   returns the zone with which a path name is associated (global
 143  144   *   zone if the path is not within some other zone's file system
 144  145   *   hierarchy).  This currently requires iterating through each zone,
 145  146   *   so it is slower than an id or name search via a hash table.
 146  147   *
 147  148   *
 148  149   *   Locking:
 149  150   *
 150  151   *   zonehash_lock: This is a top-level global lock used to protect the
 151  152   *       zone hash tables and lists.  Zones cannot be created or destroyed
 152  153   *       while this lock is held.
 153  154   *   zone_status_lock: This is a global lock protecting zone state.
 154  155   *       Zones cannot change state while this lock is held.  It also
 155  156   *       protects the list of kernel threads associated with a zone.
 156  157   *   zone_lock: This is a per-zone lock used to protect several fields of
 157  158   *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 158  159   *       this lock means that the zone cannot go away.
 159  160   *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 160  161   *       related to the zone.max-lwps rctl.
 161  162   *   zone_mem_lock: This is a per-zone lock used to protect the fields
 162  163   *       related to the zone.max-locked-memory and zone.max-swap rctls.
 163  164   *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 164  165   *       currently just max_lofi
 165  166   *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 166  167   *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 167  168   *       list (a list of zones in the ZONE_IS_DEAD state).
 168  169   *
 169  170   *   Ordering requirements:
 170  171   *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 171  172   *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 172  173   *
 173  174   *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 174  175   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 175  176   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 176  177   *
 177  178   *   Blocking memory allocations are permitted while holding any of the
 178  179   *   zone locks.
 179  180   *
 180  181   *
 181  182   *   System Call Interface:
 182  183   *
 183  184   *   The zone subsystem can be managed and queried from user level with
 184  185   *   the following system calls (all subcodes of the primary "zone"
 185  186   *   system call):
 186  187   *   - zone_create: creates a zone with selected attributes (name,
 187  188   *     root path, privileges, resource controls, ZFS datasets)
 188  189   *   - zone_enter: allows the current process to enter a zone
 189  190   *   - zone_getattr: reports attributes of a zone
 190  191   *   - zone_setattr: set attributes of a zone
 191  192   *   - zone_boot: set 'init' running for the zone
 192  193   *   - zone_list: lists all zones active in the system
 193  194   *   - zone_lookup: looks up zone id based on name
 194  195   *   - zone_shutdown: initiates shutdown process (see states above)
 195  196   *   - zone_destroy: completes shutdown process (see states above)
 196  197   *
 197  198   */
 198  199  
 199  200  #include <sys/priv_impl.h>
 200  201  #include <sys/cred.h>
 201  202  #include <c2/audit.h>
 202  203  #include <sys/debug.h>
 203  204  #include <sys/file.h>
 204  205  #include <sys/kmem.h>
 205  206  #include <sys/kstat.h>
 206  207  #include <sys/mutex.h>
 207  208  #include <sys/note.h>
 208  209  #include <sys/pathname.h>
 209  210  #include <sys/proc.h>
 210  211  #include <sys/project.h>
 211  212  #include <sys/sysevent.h>
 212  213  #include <sys/task.h>
 213  214  #include <sys/systm.h>
 214  215  #include <sys/types.h>
 215  216  #include <sys/utsname.h>
 216  217  #include <sys/vnode.h>
 217  218  #include <sys/vfs.h>
 218  219  #include <sys/systeminfo.h>
 219  220  #include <sys/policy.h>
 220  221  #include <sys/cred_impl.h>
 221  222  #include <sys/contract_impl.h>
 222  223  #include <sys/contract/process_impl.h>
 223  224  #include <sys/class.h>
 224  225  #include <sys/pool.h>
 225  226  #include <sys/pool_pset.h>
 226  227  #include <sys/pset.h>
 227  228  #include <sys/strlog.h>
 228  229  #include <sys/sysmacros.h>
 229  230  #include <sys/callb.h>
 230  231  #include <sys/vmparam.h>
 231  232  #include <sys/corectl.h>
 232  233  #include <sys/ipc_impl.h>
 233  234  #include <sys/klpd.h>
 234  235  
 235  236  #include <sys/door.h>
 236  237  #include <sys/cpuvar.h>
 237  238  #include <sys/sdt.h>
 238  239  
 239  240  #include <sys/uadmin.h>
 240  241  #include <sys/session.h>
 241  242  #include <sys/cmn_err.h>
 242  243  #include <sys/modhash.h>
 243  244  #include <sys/sunddi.h>
 244  245  #include <sys/nvpair.h>
 245  246  #include <sys/rctl.h>
 246  247  #include <sys/fss.h>
 247  248  #include <sys/brand.h>
 248  249  #include <sys/zone.h>
 249  250  #include <net/if.h>
 250  251  #include <sys/cpucaps.h>
 251  252  #include <vm/seg.h>
 252  253  #include <sys/mac.h>
 253  254  
 254  255  /*
 255  256   * This constant specifies the number of seconds that threads waiting for
 256  257   * subsystems to release a zone's general-purpose references will wait before
 257  258   * they log the zone's reference counts.  The constant's value shouldn't
 258  259   * be so small that reference counts are unnecessarily reported for zones
 259  260   * whose references are slowly released.  On the other hand, it shouldn't be so
 260  261   * large that users reboot their systems out of frustration over hung zones
 261  262   * before the system logs the zones' reference counts.
 262  263   */
 263  264  #define ZONE_DESTROY_TIMEOUT_SECS       60
 264  265  
 265  266  /* List of data link IDs which are accessible from the zone */
 266  267  typedef struct zone_dl {
                   /* ID of the data link this entry describes. */
 267  268          datalink_id_t   zdl_id;
                   /*
                    * NOTE(review): presumably network data attached to this link
                    * (cf. zone_set_network()/zone_get_network()) -- confirm.
                    */
 268  269          nvlist_t        *zdl_net;
                   /*
                    * List linkage.  NOTE(review): presumably chains entries on a
                    * per-zone list of data links -- confirm against the list_create()
                    * call site.
                    */
 269  270          list_node_t     zdl_linkage;
 270  271  } zone_dl_t;
 271  272  
 272  273  /*
 273  274   * cv used to signal that all references to the zone have been released.  This
 274  275   * needs to be global since there may be multiple waiters, and the first to
 275  276   * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 276  277   */
 277  278  static kcondvar_t zone_destroy_cv;
 278  279  /*
 279  280   * Lock used to serialize access to zone_cv.  This could have been per-zone,
 280  281   * but then we'd need another lock for zone_destroy_cv, and why bother?
 281  282   */
 282  283  static kmutex_t zone_status_lock;
 283  284  
 284  285  /*
 285  286   * ZSD-related global variables.
 286  287   */
 287  288  static kmutex_t zsd_key_lock;   /* protects the following two */
 288  289  /*
 289  290   * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 290  291   */
 291  292  static zone_key_t zsd_keyval = 0;
 292  293  /*
 293  294   * Global list of registered keys.  We use this when a new zone is created.
 294  295   */
 295  296  static list_t zsd_registered_keys;
 296  297  
 297  298  int zone_hash_size = 256;
 298  299  static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 299  300  static kmutex_t zonehash_lock;
 300  301  static uint_t zonecount;
 301  302  static id_space_t *zoneid_space;
 302  303  
 303  304  /*
 304  305   * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 305  306   * kernel proper runs, and which manages all other zones.
 306  307   *
 307  308   * Although not declared as static, the variable "zone0" should not be used
 308  309   * except for by code that needs to reference the global zone early on in boot,
 309  310   * before it is fully initialized.  All other consumers should use
 310  311   * 'global_zone'.
 311  312   */
 312  313  zone_t zone0;
 313  314  zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 314  315  
 315  316  /*
 316  317   * List of active zones, protected by zonehash_lock.
 317  318   */
 318  319  static list_t zone_active;
 319  320  
 320  321  /*
 321  322   * List of destroyed zones that still have outstanding cred references.
 322  323   * Used for debugging.  Uses a separate lock to avoid lock ordering
 323  324   * problems in zone_free.
 324  325   */
 325  326  static list_t zone_deathrow;
 326  327  static kmutex_t zone_deathrow_lock;
 327  328  
 328  329  /* number of zones is limited by virtual interface limit in IP */
 329  330  uint_t maxzones = 8192;
 330  331  
 331  332  /* Event channel used to send zone state change notifications */
 332  333  evchan_t *zone_event_chan;
 333  334  
 334  335  /*
 335  336   * This table holds the mapping from kernel zone states to
 336  337   * states visible in the state notification API.
 337  338   * The idea is that we only expose "obvious" states and
 338  339   * do not expose states which are just implementation details.
 339  340   */
 340  341  const char  *zone_status_table[] = {
                   /*
                    * One entry per kernel zone state, in the order given by the
                    * per-entry comments; several kernel states share one visible
                    * event name.  NOTE(review): presumably indexed by zone_status_t,
                    * so the order must track that enum -- confirm before reordering.
                    */
 341  342          ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 342  343          ZONE_EVENT_INITIALIZED,         /* initialized */
 343  344          ZONE_EVENT_READY,               /* ready */
 344  345          ZONE_EVENT_READY,               /* booting */
 345  346          ZONE_EVENT_RUNNING,             /* running */
 346  347          ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 347  348          ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 348  349          ZONE_EVENT_SHUTTING_DOWN,       /* down */
 349  350          ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 350  351          ZONE_EVENT_UNINITIALIZED,       /* dead */
 351  352  };
 352  353  
 353  354  /*
 354  355   * This array contains the names of the subsystems listed in zone_ref_subsys_t
 355  356   * (see sys/zone.h).
 356  357   */
 357  358  static char *zone_ref_subsys_names[] = {
                   /*
                    * Each string is paired with the zone_ref_subsys_t constant named
                    * in its trailing comment.  NOTE(review): presumably indexed by
                    * that enum, so entries must stay in enum order -- confirm.
                    */
 358  359          "NFS",          /* ZONE_REF_NFS */
 359  360          "NFSv4",        /* ZONE_REF_NFSV4 */
 360  361          "SMBFS",        /* ZONE_REF_SMBFS */
 361  362          "MNTFS",        /* ZONE_REF_MNTFS */
 362  363          "LOFI",         /* ZONE_REF_LOFI */
 363  364          "VFS",          /* ZONE_REF_VFS */
 364  365          "IPC"           /* ZONE_REF_IPC */
 365  366  };
 366  367  
 367  368  /*
 368  369   * This isn't static so lint doesn't complain.
 369  370   */
 370  371  rctl_hndl_t rc_zone_cpu_shares;
 371  372  rctl_hndl_t rc_zone_locked_mem;
 372  373  rctl_hndl_t rc_zone_max_swap;
 373  374  rctl_hndl_t rc_zone_max_lofi;
 374  375  rctl_hndl_t rc_zone_cpu_cap;
 375  376  rctl_hndl_t rc_zone_nlwps;
 376  377  rctl_hndl_t rc_zone_nprocs;
 377  378  rctl_hndl_t rc_zone_shmmax;
 378  379  rctl_hndl_t rc_zone_shmmni;
 379  380  rctl_hndl_t rc_zone_semmni;
 380  381  rctl_hndl_t rc_zone_msgmni;
 381  382  
 382  383  const char * const zone_default_initname = "/sbin/init";
 383  384  static char * const zone_prefix = "/zone/";
 384  385  static int zone_shutdown(zoneid_t zoneid);
 385  386  static int zone_add_datalink(zoneid_t, datalink_id_t);
 386  387  static int zone_remove_datalink(zoneid_t, datalink_id_t);
 387  388  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 388  389  static int zone_set_network(zoneid_t, zone_net_data_t *);
 389  390  static int zone_get_network(zoneid_t, zone_net_data_t *);
 390  391  
 391  392  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 392  393  
 393  394  static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 394  395  static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 395  396  static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 396  397  static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 397  398      zone_key_t);
 398  399  static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 399  400  static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 400  401      kmutex_t *);
 401  402  static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 402  403      kmutex_t *);
 403  404  
 404  405  /*
 405  406   * Bump this number when you alter the zone syscall interfaces; this is
 406  407   * because we need to have support for previous API versions in libc
 407  408   * to support patching; libc calls into the kernel to determine this number.
 408  409   *
 409  410   * Version 1 of the API is the version originally shipped with Solaris 10
 410  411   * Version 2 alters the zone_create system call in order to support more
 411  412   *     arguments by moving the args into a structure; and to do better
 412  413   *     error reporting when zone_create() fails.
 413  414   * Version 3 alters the zone_create system call in order to support the
 414  415   *     import of ZFS datasets to zones.
 415  416   * Version 4 alters the zone_create system call in order to support
 416  417   *     Trusted Extensions.
 417  418   * Version 5 alters the zone_boot system call, and converts its old
 418  419   *     bootargs parameter to be set by the zone_setattr API instead.
 419  420   * Version 6 adds the flag argument to zone_create.
 420  421   */
 421  422  static const int ZONE_SYSCALL_API_VERSION = 6;
 422  423  
 423  424  /*
 424  425   * Certain filesystems (such as NFS and autofs) need to know which zone
 425  426   * the mount is being placed in.  Because of this, we need to be able to
 426  427   * ensure that a zone isn't in the process of being created/destroyed such
 427  428   * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 428  429   * it gets added to the list of mounted zones, it ends up on the wrong zone's
 429  430   * mount list. Since a zone can't reside on an NFS file system, we don't
 430  431   * have to worry about the zonepath itself.
 431  432   *
 432  433   * The following functions: block_mounts()/resume_mounts() and
 433  434   * mount_in_progress()/mount_completed() are used by zones and the VFS
 434  435   * layer (respectively) to synchronize zone state transitions and new
 435  436   * mounts within a zone. This synchronization is on a per-zone basis, so
 436  437   * activity for one zone will not interfere with activity for another zone.
 437  438   *
 438  439   * The semantics are like a reader-reader lock such that there may
 439  440   * either be multiple mounts (or zone state transitions, if that weren't
 440  441   * serialized by zonehash_lock) in progress at the same time, but not
 441  442   * both.
 442  443   *
 443  444   * We use cv's so the user can ctrl-C out of the operation if it's
 444  445   * taking too long.
 445  446   *
 446  447   * The semantics are such that there is unfair bias towards the
 447  448   * "current" operation.  This means that zone halt may starve if
 448  449   * there is a rapid succession of new mounts coming in to the zone.
 449  450   */
 450  451  /*
 451  452   * Prevent new mounts from progressing to the point of calling
 452  453   * VFS_MOUNT().  If there are already mounts in this "region", wait for
 453  454   * them to complete.
            *
            * zp is the zone whose mounts are to be held off.
            *
            * Returns 1 once this caller has been recorded as a blocker (the
            * zone's zone_mounts_in_progress counter has been decremented); each
            * such success is balanced by a resume_mounts() call, which increments
            * the counter again.  Returns 0, leaving the counter untouched, when
            * cv_wait_sig() returns 0, which this code treats as having been
            * signaled.
 454  455   */
 455  456  static int
 456  457  block_mounts(zone_t *zp)
 457  458  {
 458  459          int retval = 0;
 459  460  
 460  461          /*
 461  462           * Since it may block for a long time, block_mounts() shouldn't be
 462  463           * called with zonehash_lock held.
 463  464           */
 464  465          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 465  466          mutex_enter(&zp->zone_mount_lock);
                   /*
                    * A positive count means mounts are in flight (see
                    * mount_in_progress()); wait for mount_completed() to drain them.
                    */
 466  467          while (zp->zone_mounts_in_progress > 0) {
 467  468                  if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 468  469                          goto signaled;
 469  470          }
 470  471          /*
 471  472           * A negative value of mounts_in_progress indicates that mounts
 472  473           * have been blocked by (-mounts_in_progress) different callers
 473  474           * (remotely possible if two threads enter zone_shutdown at the same
 474  475           * time).
 475  476           */
 476  477          zp->zone_mounts_in_progress--;
 477  478          retval = 1;
                   /* Both the success and the signaled path drop the lock here. */
 478  479  signaled:
 479  480          mutex_exit(&zp->zone_mount_lock);
 480  481          return (retval);
 481  482  }
 482  483  
 483  484  /*
 484  485   * The VFS layer may progress with new mounts as far as we're concerned.
 485  486   * Allow them to progress if we were the last obstacle.
            *
            * Counterpart of a successful block_mounts(): increments zp's
            * zone_mounts_in_progress counter under zone_mount_lock.
 486  487   */
 487  488  static void
 488  489  resume_mounts(zone_t *zp)
 489  490  {
 490  491          mutex_enter(&zp->zone_mount_lock);
                   /*
                    * Reaching zero means no blocker remains, so wake every thread
                    * waiting on zone_mount_cv (mount_in_progress() waits there while
                    * the count is negative).
                    */
 491  492          if (++zp->zone_mounts_in_progress == 0)
 492  493                  cv_broadcast(&zp->zone_mount_cv);
 493  494          mutex_exit(&zp->zone_mount_lock);
 494  495  }
 495  496  
 496  497  /*
 497  498   * The VFS layer is busy with a mount; this zone should wait until all
 498  499   * of its mounts are completed to progress.
            *
            * Counts one in-flight mount against zp by incrementing
            * zone_mounts_in_progress.  Each call is balanced by mount_completed(),
            * which decrements the counter.
 499  500   */
 500  501  void
 501  502  mount_in_progress(zone_t *zp)
 502  503  {
 503  504          mutex_enter(&zp->zone_mount_lock);
                   /*
                    * A negative count means block_mounts() callers are holding mounts
                    * off; wait until resume_mounts() brings it back to zero.  This
                    * uses cv_wait() rather than cv_wait_sig(), so unlike
                    * block_mounts() there is no early-return path.
                    */
 504  505          while (zp->zone_mounts_in_progress < 0)
 505  506                  cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 506  507          zp->zone_mounts_in_progress++;
 507  508          mutex_exit(&zp->zone_mount_lock);
 508  509  }
 509  510  
 510  511  /*
 511  512   * VFS is done with one mount; wake up any waiting block_mounts()
 512  513   * callers if this is the last mount.
            *
            * Counterpart of mount_in_progress(): decrements zp's
            * zone_mounts_in_progress counter under zone_mount_lock.
 513  514   */
 514  515  void
 515  516  mount_completed(zone_t *zp)
 516  517  {
 517  518          mutex_enter(&zp->zone_mount_lock);
                   /* Zero means the last in-flight mount just finished. */
 518  519          if (--zp->zone_mounts_in_progress == 0)
 519  520                  cv_broadcast(&zp->zone_mount_cv);
 520  521          mutex_exit(&zp->zone_mount_lock);
 521  522  }
 522  523  
 523  524  /*
 524  525   * ZSD routines.
 525  526   *
 526  527   * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 527  528   * defined by the pthread_key_create() and related interfaces.
 528  529   *
 529  530   * Kernel subsystems may register one or more data items and/or
 530  531   * callbacks to be executed when a zone is created, shutdown, or
 531  532   * destroyed.
 532  533   *
 533  534   * Unlike the thread counterpart, destructor callbacks will be executed
 534  535   * even if the data pointer is NULL and/or there are no constructor
 535  536   * callbacks, so it is the responsibility of such callbacks to check for
 536  537   * NULL data values if necessary.
 537  538   *
 538  539   * The locking strategy and overall picture is as follows:
 539  540   *
 540  541   * When someone calls zone_key_create(), a template ZSD entry is added to the
 541  542   * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 542  543   * holding that lock all the existing zones are marked as
 543  544   * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 544  545   * zone_zsd list (protected by zone_lock). The global list is updated first
 545  546   * (under zsd_key_lock) to make sure that newly created zones use the
 546  547   * most recent list of keys. Then under zonehash_lock we walk the zones
 547  548   * and mark them.  Similar locking is used in zone_key_delete().
 548  549   *
 549  550   * The actual create, shutdown, and destroy callbacks are done without
 550  551   * holding any lock. And zsd_flags are used to ensure that the operations
 551  552   * completed so that when zone_key_create (and zone_create) is done, as well as
 552  553   * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 553  554   * are completed.
 554  555   *
 555  556   * When new zones are created constructor callbacks for all registered ZSD
 556  557   * entries will be called. That also uses the above two phases of marking
 557  558   * what needs to be done, and then running the callbacks without holding
 558  559   * any locks.
 559  560   *
 560  561   * The framework does not provide any locking around zone_getspecific() and
 561  562   * zone_setspecific() apart from that needed for internal consistency, so
 562  563   * callers interested in atomic "test-and-set" semantics will need to provide
 563  564   * their own locking.
 564  565   */
 565  566  
 566  567  /*
 567  568   * Helper function to find the zsd_entry associated with the key in the
 568  569   * given list.
            *
            * Walks l from head to tail and returns the first entry whose zsd_key
            * equals key, or NULL if no entry matches.  The list is not modified.
            *
            * NOTE(review): no lock is taken here; callers presumably hold whatever
            * lock protects l -- confirm at the call sites.
 569  570   */
 570  571  static struct zsd_entry *
 571  572  zsd_find(list_t *l, zone_key_t key)
 572  573  {
 573  574          struct zsd_entry *zsd;
 574  575  
 575  576          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 576  577                  if (zsd->zsd_key == key) {
 577  578                          return (zsd);
 578  579                  }
 579  580          }
 580  581          return (NULL);
 581  582  }
 582  583  
 583  584  /*
 584  585   * Helper function to find the zsd_entry associated with the key in the
 585  586   * given list. Move it to the front of the list.
            *
            * Returns the first entry whose zsd_key equals key, or NULL if no entry
            * matches.  On a miss the list is left untouched; on a hit the list is
            * only changed when the entry is not already at the head.
            *
            * NOTE(review): because a hit can relink the list, callers presumably
            * hold the lock protecting l -- confirm at the call sites.
 586  587   */
 587  588  static struct zsd_entry *
 588  589  zsd_find_mru(list_t *l, zone_key_t key)
 589  590  {
 590  591          struct zsd_entry *zsd;
 591  592  
 592  593          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 593  594                  if (zsd->zsd_key == key) {
 594  595                          /*
 595  596                           * Move to head of list to keep list in MRU order.
 596  597                           */
 597  598                          if (zsd != list_head(l)) {
 598  599                                  list_remove(l, zsd);
 599  600                                  list_insert_head(l, zsd);
 600  601                          }
                                   /* Returning here ends the walk after the relink. */
 601  602                          return (zsd);
 602  603                  }
 603  604          }
 604  605          return (NULL);
 605  606  }
 606  607  
           /*
            * Register a zone-specific data (ZSD) key and its callbacks.
            *
            *   keyp     - out: receives the newly allocated key
            *   create   - optional constructor, called with the zone id
            *   shutdown - optional callback taking the zone id and the stored data
            *   destroy  - optional callback taking the zone id and the stored data
            *
            * The key is first published on zsd_registered_keys (under
            * zsd_key_lock) so that zones configured from now on pick it up.  Then,
            * under zonehash_lock, an entry is added to every zone on zone_active
            * that is neither ZONE_IS_UNINITIALIZED nor at or past ZONE_IS_DOWN.
            * With all locks dropped the create callback is applied to those zones,
            * and only afterwards is the key stored through keyp.
            */
 607  608  void
 608  609  zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 609  610      void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 610  611  {
 611  612          struct zsd_entry *zsdp;
 612  613          struct zsd_entry *t;
 613  614          struct zone *zone;
 614  615          zone_key_t  key;
 615  616
 616  617          zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 617  618          zsdp->zsd_data = NULL;
 618  619          zsdp->zsd_create = create;
 619  620          zsdp->zsd_shutdown = shutdown;
 620  621          zsdp->zsd_destroy = destroy;
 621  622
 622  623          /*
 623  624           * Insert in global list of callbacks. Makes future zone creations
 624  625           * see it.
 625  626           */
 626  627          mutex_enter(&zsd_key_lock);
                   /* Keys come from a counter; the ASSERT catches it wrapping to 0 */
 627  628          key = zsdp->zsd_key = ++zsd_keyval;
 628  629          ASSERT(zsd_keyval != 0);
 629  630          list_insert_tail(&zsd_registered_keys, zsdp);
 630  631          mutex_exit(&zsd_key_lock);
 631  632
 632  633          /*
 633  634           * Insert for all existing zones and mark them as needing
 634  635           * a create callback.
 635  636           */
 636  637          mutex_enter(&zonehash_lock);    /* stop the world */
 637  638          for (zone = list_head(&zone_active); zone != NULL;
 638  639              zone = list_next(&zone_active, zone)) {
 639  640                  zone_status_t status;
 640  641
 641  642                  mutex_enter(&zone->zone_lock);
 642  643
 643  644                  /* Skip zones that are on the way down or not yet up */
 644  645                  status = zone_status_get(zone);
 645  646                  if (status >= ZONE_IS_DOWN ||
 646  647                      status == ZONE_IS_UNINITIALIZED) {
 647  648                          mutex_exit(&zone->zone_lock);
 648  649                          continue;
 649  650                  }
 650  651
 651  652                  t = zsd_find_mru(&zone->zone_zsd, key);
 652  653                  if (t != NULL) {
 653  654                          /*
 654  655                           * A zsd_configure already inserted it after
 655  656                           * we dropped zsd_key_lock above.
 656  657                           */
 657  658                          mutex_exit(&zone->zone_lock);
 658  659                          continue;
 659  660                  }
                           /* Per-zone copy of the callbacks for this key */
 660  661                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 661  662                  t->zsd_key = key;
 662  663                  t->zsd_create = create;
 663  664                  t->zsd_shutdown = shutdown;
 664  665                  t->zsd_destroy = destroy;
 665  666                  if (create != NULL) {
                                   /* Only flagged here; the callback runs below */
 666  667                          t->zsd_flags = ZSD_CREATE_NEEDED;
 667  668                          DTRACE_PROBE2(zsd__create__needed,
 668  669                              zone_t *, zone, zone_key_t, key);
 669  670                  }
 670  671                  list_insert_tail(&zone->zone_zsd, t);
 671  672                  mutex_exit(&zone->zone_lock);
 672  673          }
 673  674          mutex_exit(&zonehash_lock);
 674  675
 675  676          if (create != NULL) {
 676  677                  /* Now call the create callback for this key */
 677  678                  zsd_apply_all_zones(zsd_apply_create, key);
 678  679          }
 679  680          /*
 680  681           * It is safe for consumers to use the key now, make it
 681  682           * globally visible. Specifically zone_getspecific() will
 682  683           * always successfully return the zone specific data associated
 683  684           * with the key.
 684  685           */
 685  686          *keyp = key;
 686  687
 687  688  }
 688  689  
 689  690  /*
 690  691   * Function called when a module is being unloaded, or otherwise wishes
 691  692   * to unregister its ZSD key and callbacks.
 692  693   *
 693  694   * Remove from the global list and determine the functions that need to
 694  695   * be called under a global lock. Then call the functions without
 695  696   * holding any locks. Finally free up the zone_zsd entries. (The apply
 696  697   * functions need to access the zone_zsd entries to find zsd_data etc.)
            *
            * Returns 0 on success, or -1 if the key is not on zsd_registered_keys.
 697  698   */
 698  699  int
 699  700  zone_key_delete(zone_key_t key)
 700  701  {
 701  702          struct zsd_entry *zsdp = NULL;
 702  703          zone_t *zone;
 703  704
                   /* Phase 1: unpublish the key so newly configured zones skip it */
 704  705          mutex_enter(&zsd_key_lock);
 705  706          zsdp = zsd_find_mru(&zsd_registered_keys, key);
 706  707          if (zsdp == NULL) {
 707  708                  mutex_exit(&zsd_key_lock);
 708  709                  return (-1);
 709  710          }
 710  711          list_remove(&zsd_registered_keys, zsdp);
 711  712          mutex_exit(&zsd_key_lock);
 712  713
                   /* Phase 2: flag the shutdown/destroy work needed in each zone */
 713  714          mutex_enter(&zonehash_lock);
 714  715          for (zone = list_head(&zone_active); zone != NULL;
 715  716              zone = list_next(&zone_active, zone)) {
 716  717                  struct zsd_entry *del;
 717  718
 718  719                  mutex_enter(&zone->zone_lock);
 719  720                  del = zsd_find_mru(&zone->zone_zsd, key);
 720  721                  if (del == NULL) {
 721  722                          /*
 722  723                           * Somebody else got here first, e.g. the zone going
 723  724                           * away.
 724  725                           */
 725  726                          mutex_exit(&zone->zone_lock);
 726  727                          continue;
 727  728                  }
 728  729                  ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 729  730                  ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
                           /* Flag only when no ZSD_SHUTDOWN_ALL bit is set yet */
 730  731                  if (del->zsd_shutdown != NULL &&
 731  732                      (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 732  733                          del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 733  734                          DTRACE_PROBE2(zsd__shutdown__needed,
 734  735                              zone_t *, zone, zone_key_t, key);
 735  736                  }
                           /* Likewise for destroy and the ZSD_DESTROY_ALL bits */
 736  737                  if (del->zsd_destroy != NULL &&
 737  738                      (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 738  739                          del->zsd_flags |= ZSD_DESTROY_NEEDED;
 739  740                          DTRACE_PROBE2(zsd__destroy__needed,
 740  741                              zone_t *, zone, zone_key_t, key);
 741  742                  }
 742  743                  mutex_exit(&zone->zone_lock);
 743  744          }
 744  745          mutex_exit(&zonehash_lock);
                   /* The template entry is not used past this point */
 745  746          kmem_free(zsdp, sizeof (*zsdp));
 746  747
 747  748          /* Now call the shutdown and destroy callback for this key */
 748  749          zsd_apply_all_zones(zsd_apply_shutdown, key);
 749  750          zsd_apply_all_zones(zsd_apply_destroy, key);
 750  751
 751  752          /* Now we can free up the zsdp structures in each zone */
 752  753          mutex_enter(&zonehash_lock);
 753  754          for (zone = list_head(&zone_active); zone != NULL;
 754  755              zone = list_next(&zone_active, zone)) {
 755  756                  struct zsd_entry *del;
 756  757
 757  758                  mutex_enter(&zone->zone_lock);
 758  759                  del = zsd_find(&zone->zone_zsd, key);
 759  760                  if (del != NULL) {
 760  761                          list_remove(&zone->zone_zsd, del);
 761  762                          ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 762  763                          kmem_free(del, sizeof (*del));
 763  764                  }
 764  765                  mutex_exit(&zone->zone_lock);
 765  766          }
 766  767          mutex_exit(&zonehash_lock);
 767  768
 768  769          return (0);
 769  770  }
 770  771  
 771  772  /*
 772  773   * ZSD counterpart of pthread_setspecific().
 773  774   *
 774  775   * Since all zsd callbacks, including those with no create function,
 775  776   * have an entry in zone_zsd, if the key is registered it is part of
 776  777   * the zone_zsd list.
 777  778   * Return an error if the key wasn't registerd.
 778  779   */
 779  780  int
 780  781  zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 781  782  {
 782  783          struct zsd_entry *t;
 783  784  
 784  785          mutex_enter(&zone->zone_lock);
 785  786          t = zsd_find_mru(&zone->zone_zsd, key);
 786  787          if (t != NULL) {
 787  788                  /*
 788  789                   * Replace old value with new
 789  790                   */
 790  791                  t->zsd_data = (void *)data;
 791  792                  mutex_exit(&zone->zone_lock);
 792  793                  return (0);
 793  794          }
 794  795          mutex_exit(&zone->zone_lock);
 795  796          return (-1);
 796  797  }
 797  798  
 798  799  /*
 799  800   * ZSD counterpart of pthread_getspecific().
 800  801   */
 801  802  void *
 802  803  zone_getspecific(zone_key_t key, zone_t *zone)
 803  804  {
 804  805          struct zsd_entry *t;
 805  806          void *data;
 806  807  
 807  808          mutex_enter(&zone->zone_lock);
 808  809          t = zsd_find_mru(&zone->zone_zsd, key);
 809  810          data = (t == NULL ? NULL : t->zsd_data);
 810  811          mutex_exit(&zone->zone_lock);
 811  812          return (data);
 812  813  }
 813  814  
 814  815  /*
 815  816   * Function used to initialize a zone's list of ZSD callbacks and data
 816  817   * when the zone is being created.  The callbacks are initialized from
 817  818   * the template list (zsd_registered_keys). The constructor callback is
 818  819   * executed later (once the zone exists and with locks dropped).
            *
            * The caller must hold zonehash_lock and the zone's zone_zsd list must
            * still be empty; both conditions are asserted.
 819  820   */
 820  821  static void
 821  822  zone_zsd_configure(zone_t *zone)
 822  823  {
 823  824          struct zsd_entry *zsdp;
 824  825          struct zsd_entry *t;
 825  826
 826  827          ASSERT(MUTEX_HELD(&zonehash_lock));
 827  828          ASSERT(list_head(&zone->zone_zsd) == NULL);
                   /* zone_lock is taken before zsd_key_lock and released after it */
 828  829          mutex_enter(&zone->zone_lock);
 829  830          mutex_enter(&zsd_key_lock);
 830  831          for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 831  832              zsdp = list_next(&zsd_registered_keys, zsdp)) {
 832  833                  /*
 833  834                   * Since this zone is still ZONE_IS_UNINITIALIZED (a state
                            * that zone_key_create skips), zone_key_create
 834  835                   * should not have added anything to it.
 835  836                   */
 836  837                  ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 837  838
                           /* Copy the template's key and callbacks into a new entry */
 838  839                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 839  840                  t->zsd_key = zsdp->zsd_key;
 840  841                  t->zsd_create = zsdp->zsd_create;
 841  842                  t->zsd_shutdown = zsdp->zsd_shutdown;
 842  843                  t->zsd_destroy = zsdp->zsd_destroy;
 843  844                  if (zsdp->zsd_create != NULL) {
 844  845                          t->zsd_flags = ZSD_CREATE_NEEDED;
 845  846                          DTRACE_PROBE2(zsd__create__needed,
 846  847                              zone_t *, zone, zone_key_t, zsdp->zsd_key);
 847  848                  }
 848  849                  list_insert_tail(&zone->zone_zsd, t);
 849  850          }
 850  851          mutex_exit(&zsd_key_lock);
 851  852          mutex_exit(&zone->zone_lock);
 852  853  }
 853  854  
           /*
            * Kinds of ZSD callback.  zone_zsd_callbacks() asserts that it is only
            * ever handed ZSD_SHUTDOWN or ZSD_DESTROY.
            */
 854  855  enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 855  856  
 856  857  /*
 857  858   * Helper function to execute shutdown or destructor callbacks.
            *
            * ct selects which kind of callback gets flagged as needed on the
            * zone's entries: ZSD_SHUTDOWN (zone must be at least ZONE_IS_EMPTY) or
            * ZSD_DESTROY (zone must be at least ZONE_IS_DOWN).  After flagging,
            * both the shutdown and the destroy apply passes are run over all of
            * the zone's keys.
 858  859   */
 859  860  static void
 860  861  zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 861  862  {
 862  863          struct zsd_entry *t;
 863  864
 864  865          ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 865  866          ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 866  867          ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 867  868
 868  869          /*
 869  870           * Run the callback solely based on what is registered for the zone
 870  871           * in zone_zsd. The global list can change independently of this
 871  872           * as keys are registered and unregistered and we don't register new
 872  873           * callbacks for a zone that is in the process of going away.
 873  874           */
 874  875          mutex_enter(&zone->zone_lock);
 875  876          for (t = list_head(&zone->zone_zsd); t != NULL;
 876  877              t = list_next(&zone->zone_zsd, t)) {
 877  878                  zone_key_t key = t->zsd_key;
 878  879
 879  880                  /*
                            * Flag an entry only if it has the relevant callback and
                            * none of its ZSD_SHUTDOWN_ALL / ZSD_DESTROY_ALL bits is
                            * set yet.
                            */
 880  881
 881  882                  if (ct == ZSD_SHUTDOWN) {
 882  883                          if (t->zsd_shutdown != NULL &&
 883  884                              (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 884  885                                  t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 885  886                                  DTRACE_PROBE2(zsd__shutdown__needed,
 886  887                                      zone_t *, zone, zone_key_t, key);
 887  888                          }
 888  889                  } else {
 889  890                          if (t->zsd_destroy != NULL &&
 890  891                              (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 891  892                                  t->zsd_flags |= ZSD_DESTROY_NEEDED;
 892  893                                  DTRACE_PROBE2(zsd__destroy__needed,
 893  894                                      zone_t *, zone, zone_key_t, key);
 894  895                          }
 895  896                  }
 896  897          }
 897  898          mutex_exit(&zone->zone_lock);
 898  899
 899  900          /*
                    * Now run the shutdown and destroy apply functions over every
                    * key of this zone.  Both passes run whatever ct is.
                    */
 900  901          zsd_apply_all_keys(zsd_apply_shutdown, zone);
 901  902          zsd_apply_all_keys(zsd_apply_destroy, zone);
 902  903
 903  904  }
 904  905  
 905  906  /*
 906  907   * Called when the zone is going away; free ZSD-related memory, and
 907  908   * destroy the zone_zsd list.
 908  909   */
 909  910  static void
 910  911  zone_free_zsd(zone_t *zone)
 911  912  {
 912  913          struct zsd_entry *t, *next;
 913  914  
 914  915          /*
 915  916           * Free all the zsd_entry's we had on this zone.
 916  917           */
 917  918          mutex_enter(&zone->zone_lock);
 918  919          for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 919  920                  next = list_next(&zone->zone_zsd, t);
 920  921                  list_remove(&zone->zone_zsd, t);
 921  922                  ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 922  923                  kmem_free(t, sizeof (*t));
 923  924          }
 924  925          list_destroy(&zone->zone_zsd);
 925  926          mutex_exit(&zone->zone_lock);
 926  927  
 927  928  }
 928  929  
 929  930  /*
 930  931   * Apply a function to all zones for particular key value.
 931  932   *
 932  933   * The applyfn has to drop zonehash_lock if it does some work, and
 933  934   * then reacquire it before it returns.
 934  935   * When the lock is dropped we don't follow list_next even
 935  936   * if it is possible to do so without any hazards. This is
 936  937   * because we want the design to allow for the list of zones
 937  938   * to change in any arbitrary way during the time the
 938  939   * lock was dropped.
 939  940   *
 940  941   * It is safe to restart the loop at list_head since the applyfn
 941  942   * changes the zsd_flags as it does work, so a subsequent
 942  943   * pass through will have no effect in applyfn, hence the loop will terminate
 943  944   * in at worst O(N^2).
 944  945   */
 945  946  static void
 946  947  zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 947  948  {
 948  949          zone_t *zone;
 949  950  
 950  951          mutex_enter(&zonehash_lock);
 951  952          zone = list_head(&zone_active);
 952  953          while (zone != NULL) {
 953  954                  if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 954  955                          /* Lock dropped - restart at head */
 955  956                          zone = list_head(&zone_active);
 956  957                  } else {
 957  958                          zone = list_next(&zone_active, zone);
 958  959                  }
 959  960          }
 960  961          mutex_exit(&zonehash_lock);
 961  962  }
 962  963  
 963  964  /*
 964  965   * Apply a function to all keys for a particular zone.
 965  966   *
 966  967   * The applyfn has to drop zonehash_lock if it does some work, and
 967  968   * then reacquire it before it returns.
 968  969   * When the lock is dropped we don't follow list_next even
 969  970   * if it is possible to do so without any hazards. This is
 970  971   * because we want the design to allow for the list of zsd callbacks
 971  972   * to change in any arbitrary way during the time the
 972  973   * lock was dropped.
 973  974   *
 974  975   * It is safe to restart the loop at list_head since the applyfn
 975  976   * changes the zsd_flags as it does work, so a subsequent
 976  977   * pass through will have no effect in applyfn, hence the loop will terminate
 977  978   * in at worst O(N^2).
 978  979   */
 979  980  static void
 980  981  zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 981  982  {
 982  983          struct zsd_entry *t;
 983  984  
 984  985          mutex_enter(&zone->zone_lock);
 985  986          t = list_head(&zone->zone_zsd);
 986  987          while (t != NULL) {
 987  988                  if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 988  989                          /* Lock dropped - restart at head */
 989  990                          t = list_head(&zone->zone_zsd);
 990  991                  } else {
 991  992                          t = list_next(&zone->zone_zsd, t);
 992  993                  }
 993  994          }
 994  995          mutex_exit(&zone->zone_lock);
 995  996  }
 996  997  
 997  998  /*
 998  999   * Call the create function for the zone and key if CREATE_NEEDED
 999 1000   * is set.
1000 1001   * If some other thread gets here first and sets CREATE_INPROGRESS, then
1001 1002   * we wait for that thread to complete so that we can ensure that
1002 1003   * all the callbacks are done when we've looped over all zones/keys.
1003 1004   *
1004 1005   * When we call the create function, we drop the global lock held by the
1005 1006   * caller, and return true to tell the caller it needs to re-evaluate the
1006 1007   * state.
1007 1008   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1008 1009   * remains held on exit.
            *
            *   lockp          - global lock held by the caller, or NULL if none
            *   zone_lock_held - B_TRUE if the caller already holds zone->zone_lock
            *   zone, key      - identify the zsd_entry to operate on
            *
            * Returns B_TRUE if locks were dropped, either to run the callback or
            * as reported by zsd_wait_for_inprogress; B_FALSE otherwise, including
            * when the zone has no entry for the key.
1009 1010   */
1010 1011  static boolean_t
1011 1012  zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1012 1013      zone_t *zone, zone_key_t key)
1013 1014  {
1014 1015          void *result;
1015 1016          struct zsd_entry *t;
1016 1017          boolean_t dropped;
1017 1018
1018 1019          if (lockp != NULL) {
1019 1020                  ASSERT(MUTEX_HELD(lockp));
1020 1021          }
1021 1022          if (zone_lock_held) {
1022 1023                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1023 1024          } else {
1024 1025                  mutex_enter(&zone->zone_lock);
1025 1026          }
1026 1027
1027 1028          t = zsd_find(&zone->zone_zsd, key);
1028 1029          if (t == NULL) {
1029 1030                  /*
1030 1031                   * Somebody else got here first, e.g. the zone going
1031 1032                   * away.
1032 1033                   */
1033 1034                  if (!zone_lock_held)
1034 1035                          mutex_exit(&zone->zone_lock);
1035 1036                  return (B_FALSE);
1036 1037          }
1037 1038          dropped = B_FALSE;
1038 1039          if (zsd_wait_for_inprogress(zone, t, lockp))
1039 1040                  dropped = B_TRUE;
1040 1041
1041 1042          if (t->zsd_flags & ZSD_CREATE_NEEDED) {
                           /* Claim the work: NEEDED -> INPROGRESS under zone_lock */
1042 1043                  t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1043 1044                  t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1044 1045                  DTRACE_PROBE2(zsd__create__inprogress,
1045 1046                      zone_t *, zone, zone_key_t, key);
                           /* The callback runs with neither lock held */
1046 1047                  mutex_exit(&zone->zone_lock);
1047 1048                  if (lockp != NULL)
1048 1049                          mutex_exit(lockp);
1049 1050
1050 1051                  dropped = B_TRUE;
1051 1052                  ASSERT(t->zsd_create != NULL);
1052 1053                  DTRACE_PROBE2(zsd__create__start,
1053 1054                      zone_t *, zone, zone_key_t, key);
1054 1055
1055 1056                  result = (*t->zsd_create)(zone->zone_id);
1056 1057
                           /* Second probe argument is the create callback's result */
1057 1058                  DTRACE_PROBE2(zsd__create__end,
1058 1059                      zone_t *, zone, void *, result);
1059 1060
1060 1061                  ASSERT(result != NULL);
                           /* Re-take the locks: lockp first, then zone_lock */
1061 1062                  if (lockp != NULL)
1062 1063                          mutex_enter(lockp);
1063 1064                  mutex_enter(&zone->zone_lock);
1064 1065                  t->zsd_data = result;
1065 1066                  t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1066 1067                  t->zsd_flags |= ZSD_CREATE_COMPLETED;
                           /* Wake any thread waiting on this entry's zsd_cv */
1067 1068                  cv_broadcast(&t->zsd_cv);
1068 1069                  DTRACE_PROBE2(zsd__create__completed,
1069 1070                      zone_t *, zone, zone_key_t, key);
1070 1071          }
1071 1072          if (!zone_lock_held)
1072 1073                  mutex_exit(&zone->zone_lock);
1073 1074          return (dropped);
1074 1075  }
1075 1076  
1076 1077  /*
1077 1078   * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1078 1079   * is set.
1079 1080   * If some other thread gets here first and sets *_INPROGRESS, then
1080 1081   * we wait for that thread to complete so that we can ensure that
1081 1082   * all the callbacks are done when we've looped over all zones/keys.
1082 1083   *
1083 1084   * When we call the shutdown function, we drop the global lock held by the
1084 1085   * caller, and return true to tell the caller it needs to re-evaluate the
1085 1086   * state.
1086 1087   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1087 1088   * remains held on exit.
            *
            *   lockp          - global lock held by the caller, or NULL if none
            *   zone_lock_held - B_TRUE if the caller already holds zone->zone_lock
            *   zone, key      - identify the zsd_entry to operate on
            *
            * Returns B_TRUE if locks were dropped, either to run the callback or
            * as reported by the zsd_wait_for_*() helpers; B_FALSE otherwise,
            * including when the zone has no entry for the key.
1088 1089   */
1089 1090  static boolean_t
1090 1091  zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1091 1092      zone_t *zone, zone_key_t key)
1092 1093  {
1093 1094          struct zsd_entry *t;
1094 1095          void *data;
1095 1096          boolean_t dropped;
1096 1097
1097 1098          if (lockp != NULL) {
1098 1099                  ASSERT(MUTEX_HELD(lockp));
1099 1100          }
1100 1101          if (zone_lock_held) {
1101 1102                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1102 1103          } else {
1103 1104                  mutex_enter(&zone->zone_lock);
1104 1105          }
1105 1106
1106 1107          t = zsd_find(&zone->zone_zsd, key);
1107 1108          if (t == NULL) {
1108 1109                  /*
1109 1110                   * Somebody else got here first, e.g. the zone going
1110 1111                   * away.
1111 1112                   */
1112 1113                  if (!zone_lock_held)
1113 1114                          mutex_exit(&zone->zone_lock);
1114 1115                  return (B_FALSE);
1115 1116          }
1116 1117          dropped = B_FALSE;
                   /* A still-pending create must be picked up before shutdown runs */
1117 1118          if (zsd_wait_for_creator(zone, t, lockp))
1118 1119                  dropped = B_TRUE;
1119 1120
1120 1121          if (zsd_wait_for_inprogress(zone, t, lockp))
1121 1122                  dropped = B_TRUE;
1122 1123
1123 1124          if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
                           /* Claim the work: NEEDED -> INPROGRESS under zone_lock */
1124 1125                  t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1125 1126                  t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1126 1127                  DTRACE_PROBE2(zsd__shutdown__inprogress,
1127 1128                      zone_t *, zone, zone_key_t, key);
                           /* The callback runs with neither lock held */
1128 1129                  mutex_exit(&zone->zone_lock);
1129 1130                  if (lockp != NULL)
1130 1131                          mutex_exit(lockp);
1131 1132                  dropped = B_TRUE;
1132 1133
1133 1134                  ASSERT(t->zsd_shutdown != NULL);
1134 1135                  data = t->zsd_data;
1135 1136
1136 1137                  DTRACE_PROBE2(zsd__shutdown__start,
1137 1138                      zone_t *, zone, zone_key_t, key);
1138 1139
1139 1140                  (t->zsd_shutdown)(zone->zone_id, data);
1140 1141                  DTRACE_PROBE2(zsd__shutdown__end,
1141 1142                      zone_t *, zone, zone_key_t, key);
1142 1143
                           /* Re-take the locks: lockp first, then zone_lock */
1143 1144                  if (lockp != NULL)
1144 1145                          mutex_enter(lockp);
1145 1146                  mutex_enter(&zone->zone_lock);
1146 1147                  t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1147 1148                  t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
                           /* Wake any thread waiting on this entry's zsd_cv */
1148 1149                  cv_broadcast(&t->zsd_cv);
1149 1150                  DTRACE_PROBE2(zsd__shutdown__completed,
1150 1151                      zone_t *, zone, zone_key_t, key);
1151 1152          }
1152 1153          if (!zone_lock_held)
1153 1154                  mutex_exit(&zone->zone_lock);
1154 1155          return (dropped);
1155 1156  }
1156 1157  
1157 1158  /*
1158 1159   * Call the destroy function for the zone and key if DESTROY_NEEDED
1159 1160   * is set.
1160 1161   * If some other thread gets here first and sets *_INPROGRESS, then
1161 1162   * we wait for that thread to complete so that we can ensure that
1162 1163   * all the callbacks are done when we've looped over all zones/keys.
1163 1164   *
1164 1165   * When we call the destroy function, we drop the global lock held by the
1165 1166   * caller, and return true to tell the caller it needs to re-evaluate the
1166 1167   * state.
1167 1168   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1168 1169   * remains held on exit.
            *
            *   lockp          - global lock held by the caller, or NULL if none
            *   zone_lock_held - B_TRUE if the caller already holds zone->zone_lock
            *   zone, key      - identify the zsd_entry to operate on
            *
            * Returns B_TRUE if locks were dropped, either to run the callback or
            * as reported by the zsd_wait_for_*() helpers; B_FALSE otherwise,
            * including when the zone has no entry for the key.
1169 1170   */
1170 1171  static boolean_t
1171 1172  zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1172 1173      zone_t *zone, zone_key_t key)
1173 1174  {
1174 1175          struct zsd_entry *t;
1175 1176          void *data;
1176 1177          boolean_t dropped;
1177 1178
1178 1179          if (lockp != NULL) {
1179 1180                  ASSERT(MUTEX_HELD(lockp));
1180 1181          }
1181 1182          if (zone_lock_held) {
1182 1183                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1183 1184          } else {
1184 1185                  mutex_enter(&zone->zone_lock);
1185 1186          }
1186 1187
1187 1188          t = zsd_find(&zone->zone_zsd, key);
1188 1189          if (t == NULL) {
1189 1190                  /*
1190 1191                   * Somebody else got here first, e.g. the zone going
1191 1192                   * away.
1192 1193                   */
1193 1194                  if (!zone_lock_held)
1194 1195                          mutex_exit(&zone->zone_lock);
1195 1196                  return (B_FALSE);
1196 1197          }
1197 1198          dropped = B_FALSE;
                   /* A still-pending create must be picked up before destroy runs */
1198 1199          if (zsd_wait_for_creator(zone, t, lockp))
1199 1200                  dropped = B_TRUE;
1200 1201
1201 1202          if (zsd_wait_for_inprogress(zone, t, lockp))
1202 1203                  dropped = B_TRUE;
1203 1204
1204 1205          if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
                           /* Claim the work: NEEDED -> INPROGRESS under zone_lock */
1205 1206                  t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1206 1207                  t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1207 1208                  DTRACE_PROBE2(zsd__destroy__inprogress,
1208 1209                      zone_t *, zone, zone_key_t, key);
                           /* The callback runs with neither lock held */
1209 1210                  mutex_exit(&zone->zone_lock);
1210 1211                  if (lockp != NULL)
1211 1212                          mutex_exit(lockp);
1212 1213                  dropped = B_TRUE;
1213 1214
1214 1215                  ASSERT(t->zsd_destroy != NULL);
1215 1216                  data = t->zsd_data;
1216 1217                  DTRACE_PROBE2(zsd__destroy__start,
1217 1218                      zone_t *, zone, zone_key_t, key);
1218 1219
1219 1220                  (t->zsd_destroy)(zone->zone_id, data);
1220 1221                  DTRACE_PROBE2(zsd__destroy__end,
1221 1222                      zone_t *, zone, zone_key_t, key);
1222 1223
                           /* Re-take the locks: lockp first, then zone_lock */
1223 1224                  if (lockp != NULL)
1224 1225                          mutex_enter(lockp);
1225 1226                  mutex_enter(&zone->zone_lock);
                           /* The data pointer is cleared once destroy has returned */
1226 1227                  t->zsd_data = NULL;
1227 1228                  t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1228 1229                  t->zsd_flags |= ZSD_DESTROY_COMPLETED;
                           /* Wake any thread waiting on this entry's zsd_cv */
1229 1230                  cv_broadcast(&t->zsd_cv);
1230 1231                  DTRACE_PROBE2(zsd__destroy__completed,
1231 1232                      zone_t *, zone, zone_key_t, key);
1232 1233          }
1233 1234          if (!zone_lock_held)
1234 1235                  mutex_exit(&zone->zone_lock);
1235 1236          return (dropped);
1236 1237  }
1237 1238  
1238 1239  /*
1239 1240   * Wait for any CREATE_NEEDED flag to be cleared.
1240 1241   * Returns true if lockp was temporarily dropped while waiting.
            *
            * Called with zone->zone_lock held (it is the mutex handed to cv_wait)
            * and, when lockp is non-NULL, with lockp held as well.  Both are held
            * again on return.  When lockp is NULL the return value is always
            * B_FALSE, even if the function waited.
1241 1242   */
1242 1243  static boolean_t
1243 1244  zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1244 1245  {
1245 1246          boolean_t dropped = B_FALSE;
1246 1247
1247 1248          while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1248 1249                  DTRACE_PROBE2(zsd__wait__for__creator,
1249 1250                      zone_t *, zone, struct zsd_entry *, t);
                           /* Release the caller's global lock before blocking */
1250 1251                  if (lockp != NULL) {
1251 1252                          dropped = B_TRUE;
1252 1253                          mutex_exit(lockp);
1253 1254                  }
                           /* Sleep on the entry's cv; zone_lock is the cv's mutex */
1254 1255                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1255 1256                  if (lockp != NULL) {
1256 1257                          /* First drop zone_lock to preserve order */
1257 1258                          mutex_exit(&zone->zone_lock);
1258 1259                          mutex_enter(lockp);
1259 1260                          mutex_enter(&zone->zone_lock);
1260 1261                  }
1261 1262          }
1262 1263          return (dropped);
1263 1264  }
1264 1265  
1265 1266  /*
1266 1267   * Wait for any INPROGRESS flag to be cleared.
1267 1268   * Returns true if lockp was temporarily dropped while waiting.
            *
            * Called with zone->zone_lock held (it is the mutex handed to cv_wait)
            * and, when lockp is non-NULL, with lockp held as well.  Both are held
            * again on return.  When lockp is NULL the return value is always
            * B_FALSE, even if the function waited.
1268 1269   */
1269 1270  static boolean_t
1270 1271  zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1271 1272  {
1272 1273          boolean_t dropped = B_FALSE;
1273 1274
                   /* Loop while any bit of the ZSD_ALL_INPROGRESS mask is still set */
1274 1275          while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1275 1276                  DTRACE_PROBE2(zsd__wait__for__inprogress,
1276 1277                      zone_t *, zone, struct zsd_entry *, t);
                           /* Release the caller's global lock before blocking */
1277 1278                  if (lockp != NULL) {
1278 1279                          dropped = B_TRUE;
1279 1280                          mutex_exit(lockp);
1280 1281                  }
                           /* Sleep on the entry's cv; zone_lock is the cv's mutex */
1281 1282                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1282 1283                  if (lockp != NULL) {
1283 1284                          /* First drop zone_lock to preserve order */
1284 1285                          mutex_exit(&zone->zone_lock);
1285 1286                          mutex_enter(lockp);
1286 1287                          mutex_enter(&zone->zone_lock);
1287 1288                  }
1288 1289          }
1289 1290          return (dropped);
1290 1291  }
1291 1292  
1292 1293  /*
1293 1294   * Frees memory associated with the zone dataset list.
1294 1295   */
1295 1296  static void
1296 1297  zone_free_datasets(zone_t *zone)
1297 1298  {
1298 1299          zone_dataset_t *t, *next;
1299 1300  
1300 1301          for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1301 1302                  next = list_next(&zone->zone_datasets, t);
1302 1303                  list_remove(&zone->zone_datasets, t);
1303 1304                  kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1304 1305                  kmem_free(t, sizeof (*t));
1305 1306          }
1306 1307          list_destroy(&zone->zone_datasets);
1307 1308  }
1308 1309  
1309 1310  /*
1310 1311   * zone.cpu-shares resource control support.
1311 1312   */
1312 1313  /*ARGSUSED*/
1313 1314  static rctl_qty_t
1314 1315  zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1315 1316  {
1316 1317          ASSERT(MUTEX_HELD(&p->p_lock));
1317 1318          return (p->p_zone->zone_shares);
1318 1319  }
1319 1320  
1320 1321  /*ARGSUSED*/
1321 1322  static int
1322 1323  zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1323 1324      rctl_qty_t nv)
1324 1325  {
1325 1326          ASSERT(MUTEX_HELD(&p->p_lock));
1326 1327          ASSERT(e->rcep_t == RCENTITY_ZONE);
1327 1328          if (e->rcep_p.zone == NULL)
1328 1329                  return (0);
1329 1330  
1330 1331          e->rcep_p.zone->zone_shares = nv;
1331 1332          return (0);
1332 1333  }
1333 1334  
           /*
            * rctl operations vector for zone.cpu-shares.  It plugs in
            * zone_cpu_shares_usage and zone_cpu_shares_set and leaves the first
            * and last slots as the rcop_no_action / rcop_no_test placeholders.
            * NOTE(review): slot order is presumably action, usage, set, test;
            * confirm against the rctl_ops_t declaration.
            */
1334 1335  static rctl_ops_t zone_cpu_shares_ops = {
1335 1336          rcop_no_action,
1336 1337          zone_cpu_shares_usage,
1337 1338          zone_cpu_shares_set,
1338 1339          rcop_no_test
1339 1340  };
1340 1341  
1341 1342  /*
1342 1343   * zone.cpu-cap resource control support.
1343 1344   */
1344 1345  /*ARGSUSED*/
1345 1346  static rctl_qty_t
1346 1347  zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1347 1348  {
1348 1349          ASSERT(MUTEX_HELD(&p->p_lock));
1349 1350          return (cpucaps_zone_get(p->p_zone));
1350 1351  }
1351 1352  
1352 1353  /*ARGSUSED*/
1353 1354  static int
1354 1355  zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1355 1356      rctl_qty_t nv)
1356 1357  {
1357 1358          zone_t *zone = e->rcep_p.zone;
1358 1359  
1359 1360          ASSERT(MUTEX_HELD(&p->p_lock));
1360 1361          ASSERT(e->rcep_t == RCENTITY_ZONE);
1361 1362  
1362 1363          if (zone == NULL)
1363 1364                  return (0);
1364 1365  
1365 1366          /*
1366 1367           * set cap to the new value.
1367 1368           */
1368 1369          return (cpucaps_zone_set(zone, nv));
1369 1370  }
1370 1371  
1371 1372  static rctl_ops_t zone_cpu_cap_ops = {
1372 1373          rcop_no_action,
1373 1374          zone_cpu_cap_get,
1374 1375          zone_cpu_cap_set,
1375 1376          rcop_no_test
1376 1377  };
1377 1378  
1378 1379  /*ARGSUSED*/
1379 1380  static rctl_qty_t
1380 1381  zone_lwps_usage(rctl_t *r, proc_t *p)
1381 1382  {
1382 1383          rctl_qty_t nlwps;
1383 1384          zone_t *zone = p->p_zone;
1384 1385  
1385 1386          ASSERT(MUTEX_HELD(&p->p_lock));
1386 1387  
1387 1388          mutex_enter(&zone->zone_nlwps_lock);
1388 1389          nlwps = zone->zone_nlwps;
1389 1390          mutex_exit(&zone->zone_nlwps_lock);
1390 1391  
1391 1392          return (nlwps);
1392 1393  }
1393 1394  
1394 1395  /*ARGSUSED*/
1395 1396  static int
1396 1397  zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1397 1398      rctl_qty_t incr, uint_t flags)
1398 1399  {
1399 1400          rctl_qty_t nlwps;
1400 1401  
1401 1402          ASSERT(MUTEX_HELD(&p->p_lock));
1402 1403          ASSERT(e->rcep_t == RCENTITY_ZONE);
1403 1404          if (e->rcep_p.zone == NULL)
1404 1405                  return (0);
1405 1406          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1406 1407          nlwps = e->rcep_p.zone->zone_nlwps;
1407 1408  
1408 1409          if (nlwps + incr > rcntl->rcv_value)
1409 1410                  return (1);
1410 1411  
1411 1412          return (0);
1412 1413  }
1413 1414  
1414 1415  /*ARGSUSED*/
1415 1416  static int
1416 1417  zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1417 1418  {
1418 1419          ASSERT(MUTEX_HELD(&p->p_lock));
1419 1420          ASSERT(e->rcep_t == RCENTITY_ZONE);
1420 1421          if (e->rcep_p.zone == NULL)
1421 1422                  return (0);
1422 1423          e->rcep_p.zone->zone_nlwps_ctl = nv;
1423 1424          return (0);
1424 1425  }
1425 1426  
1426 1427  static rctl_ops_t zone_lwps_ops = {
1427 1428          rcop_no_action,
1428 1429          zone_lwps_usage,
1429 1430          zone_lwps_set,
1430 1431          zone_lwps_test,
1431 1432  };
1432 1433  
1433 1434  /*ARGSUSED*/
1434 1435  static rctl_qty_t
1435 1436  zone_procs_usage(rctl_t *r, proc_t *p)
1436 1437  {
1437 1438          rctl_qty_t nprocs;
1438 1439          zone_t *zone = p->p_zone;
1439 1440  
1440 1441          ASSERT(MUTEX_HELD(&p->p_lock));
1441 1442  
1442 1443          mutex_enter(&zone->zone_nlwps_lock);
1443 1444          nprocs = zone->zone_nprocs;
1444 1445          mutex_exit(&zone->zone_nlwps_lock);
1445 1446  
1446 1447          return (nprocs);
1447 1448  }
1448 1449  
1449 1450  /*ARGSUSED*/
1450 1451  static int
1451 1452  zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1452 1453      rctl_qty_t incr, uint_t flags)
1453 1454  {
1454 1455          rctl_qty_t nprocs;
1455 1456  
1456 1457          ASSERT(MUTEX_HELD(&p->p_lock));
1457 1458          ASSERT(e->rcep_t == RCENTITY_ZONE);
1458 1459          if (e->rcep_p.zone == NULL)
1459 1460                  return (0);
1460 1461          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1461 1462          nprocs = e->rcep_p.zone->zone_nprocs;
1462 1463  
1463 1464          if (nprocs + incr > rcntl->rcv_value)
1464 1465                  return (1);
1465 1466  
1466 1467          return (0);
1467 1468  }
1468 1469  
1469 1470  /*ARGSUSED*/
1470 1471  static int
1471 1472  zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1472 1473  {
1473 1474          ASSERT(MUTEX_HELD(&p->p_lock));
1474 1475          ASSERT(e->rcep_t == RCENTITY_ZONE);
1475 1476          if (e->rcep_p.zone == NULL)
1476 1477                  return (0);
1477 1478          e->rcep_p.zone->zone_nprocs_ctl = nv;
1478 1479          return (0);
1479 1480  }
1480 1481  
1481 1482  static rctl_ops_t zone_procs_ops = {
1482 1483          rcop_no_action,
1483 1484          zone_procs_usage,
1484 1485          zone_procs_set,
1485 1486          zone_procs_test,
1486 1487  };
1487 1488  
1488 1489  /*ARGSUSED*/
1489 1490  static int
1490 1491  zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1491 1492      rctl_qty_t incr, uint_t flags)
1492 1493  {
1493 1494          rctl_qty_t v;
1494 1495          ASSERT(MUTEX_HELD(&p->p_lock));
1495 1496          ASSERT(e->rcep_t == RCENTITY_ZONE);
1496 1497          v = e->rcep_p.zone->zone_shmmax + incr;
1497 1498          if (v > rval->rcv_value)
1498 1499                  return (1);
1499 1500          return (0);
1500 1501  }
1501 1502  
1502 1503  static rctl_ops_t zone_shmmax_ops = {
1503 1504          rcop_no_action,
1504 1505          rcop_no_usage,
1505 1506          rcop_no_set,
1506 1507          zone_shmmax_test
1507 1508  };
1508 1509  
1509 1510  /*ARGSUSED*/
1510 1511  static int
1511 1512  zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1512 1513      rctl_qty_t incr, uint_t flags)
1513 1514  {
1514 1515          rctl_qty_t v;
1515 1516          ASSERT(MUTEX_HELD(&p->p_lock));
1516 1517          ASSERT(e->rcep_t == RCENTITY_ZONE);
1517 1518          v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1518 1519          if (v > rval->rcv_value)
1519 1520                  return (1);
1520 1521          return (0);
1521 1522  }
1522 1523  
1523 1524  static rctl_ops_t zone_shmmni_ops = {
1524 1525          rcop_no_action,
1525 1526          rcop_no_usage,
1526 1527          rcop_no_set,
1527 1528          zone_shmmni_test
1528 1529  };
1529 1530  
1530 1531  /*ARGSUSED*/
1531 1532  static int
1532 1533  zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1533 1534      rctl_qty_t incr, uint_t flags)
1534 1535  {
1535 1536          rctl_qty_t v;
1536 1537          ASSERT(MUTEX_HELD(&p->p_lock));
1537 1538          ASSERT(e->rcep_t == RCENTITY_ZONE);
1538 1539          v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1539 1540          if (v > rval->rcv_value)
1540 1541                  return (1);
1541 1542          return (0);
1542 1543  }
1543 1544  
1544 1545  static rctl_ops_t zone_semmni_ops = {
1545 1546          rcop_no_action,
1546 1547          rcop_no_usage,
1547 1548          rcop_no_set,
1548 1549          zone_semmni_test
1549 1550  };
1550 1551  
1551 1552  /*ARGSUSED*/
1552 1553  static int
1553 1554  zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1554 1555      rctl_qty_t incr, uint_t flags)
1555 1556  {
1556 1557          rctl_qty_t v;
1557 1558          ASSERT(MUTEX_HELD(&p->p_lock));
1558 1559          ASSERT(e->rcep_t == RCENTITY_ZONE);
1559 1560          v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1560 1561          if (v > rval->rcv_value)
1561 1562                  return (1);
1562 1563          return (0);
1563 1564  }
1564 1565  
1565 1566  static rctl_ops_t zone_msgmni_ops = {
1566 1567          rcop_no_action,
1567 1568          rcop_no_usage,
1568 1569          rcop_no_set,
1569 1570          zone_msgmni_test
1570 1571  };
1571 1572  
1572 1573  /*ARGSUSED*/
1573 1574  static rctl_qty_t
1574 1575  zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1575 1576  {
1576 1577          rctl_qty_t q;
1577 1578          ASSERT(MUTEX_HELD(&p->p_lock));
1578 1579          mutex_enter(&p->p_zone->zone_mem_lock);
1579 1580          q = p->p_zone->zone_locked_mem;
1580 1581          mutex_exit(&p->p_zone->zone_mem_lock);
1581 1582          return (q);
1582 1583  }
1583 1584  
1584 1585  /*ARGSUSED*/
1585 1586  static int
1586 1587  zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1587 1588      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1588 1589  {
1589 1590          rctl_qty_t q;
1590 1591          zone_t *z;
1591 1592  
1592 1593          z = e->rcep_p.zone;
1593 1594          ASSERT(MUTEX_HELD(&p->p_lock));
1594 1595          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1595 1596          q = z->zone_locked_mem;
1596 1597          if (q + incr > rcntl->rcv_value)
1597 1598                  return (1);
1598 1599          return (0);
1599 1600  }
1600 1601  
1601 1602  /*ARGSUSED*/
1602 1603  static int
1603 1604  zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1604 1605      rctl_qty_t nv)
1605 1606  {
1606 1607          ASSERT(MUTEX_HELD(&p->p_lock));
1607 1608          ASSERT(e->rcep_t == RCENTITY_ZONE);
1608 1609          if (e->rcep_p.zone == NULL)
1609 1610                  return (0);
1610 1611          e->rcep_p.zone->zone_locked_mem_ctl = nv;
1611 1612          return (0);
1612 1613  }
1613 1614  
1614 1615  static rctl_ops_t zone_locked_mem_ops = {
1615 1616          rcop_no_action,
1616 1617          zone_locked_mem_usage,
1617 1618          zone_locked_mem_set,
1618 1619          zone_locked_mem_test
1619 1620  };
1620 1621  
1621 1622  /*ARGSUSED*/
1622 1623  static rctl_qty_t
1623 1624  zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1624 1625  {
1625 1626          rctl_qty_t q;
1626 1627          zone_t *z = p->p_zone;
1627 1628  
1628 1629          ASSERT(MUTEX_HELD(&p->p_lock));
1629 1630          mutex_enter(&z->zone_mem_lock);
1630 1631          q = z->zone_max_swap;
1631 1632          mutex_exit(&z->zone_mem_lock);
1632 1633          return (q);
1633 1634  }
1634 1635  
1635 1636  /*ARGSUSED*/
1636 1637  static int
1637 1638  zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1638 1639      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1639 1640  {
1640 1641          rctl_qty_t q;
1641 1642          zone_t *z;
1642 1643  
1643 1644          z = e->rcep_p.zone;
1644 1645          ASSERT(MUTEX_HELD(&p->p_lock));
1645 1646          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1646 1647          q = z->zone_max_swap;
1647 1648          if (q + incr > rcntl->rcv_value)
1648 1649                  return (1);
1649 1650          return (0);
1650 1651  }
1651 1652  
1652 1653  /*ARGSUSED*/
1653 1654  static int
1654 1655  zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1655 1656      rctl_qty_t nv)
1656 1657  {
1657 1658          ASSERT(MUTEX_HELD(&p->p_lock));
1658 1659          ASSERT(e->rcep_t == RCENTITY_ZONE);
1659 1660          if (e->rcep_p.zone == NULL)
1660 1661                  return (0);
1661 1662          e->rcep_p.zone->zone_max_swap_ctl = nv;
1662 1663          return (0);
1663 1664  }
1664 1665  
1665 1666  static rctl_ops_t zone_max_swap_ops = {
1666 1667          rcop_no_action,
1667 1668          zone_max_swap_usage,
1668 1669          zone_max_swap_set,
1669 1670          zone_max_swap_test
1670 1671  };
1671 1672  
1672 1673  /*ARGSUSED*/
1673 1674  static rctl_qty_t
1674 1675  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1675 1676  {
1676 1677          rctl_qty_t q;
1677 1678          zone_t *z = p->p_zone;
1678 1679  
1679 1680          ASSERT(MUTEX_HELD(&p->p_lock));
1680 1681          mutex_enter(&z->zone_rctl_lock);
1681 1682          q = z->zone_max_lofi;
1682 1683          mutex_exit(&z->zone_rctl_lock);
1683 1684          return (q);
1684 1685  }
1685 1686  
1686 1687  /*ARGSUSED*/
1687 1688  static int
1688 1689  zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1689 1690      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1690 1691  {
1691 1692          rctl_qty_t q;
1692 1693          zone_t *z;
1693 1694  
1694 1695          z = e->rcep_p.zone;
1695 1696          ASSERT(MUTEX_HELD(&p->p_lock));
1696 1697          ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1697 1698          q = z->zone_max_lofi;
1698 1699          if (q + incr > rcntl->rcv_value)
1699 1700                  return (1);
1700 1701          return (0);
1701 1702  }
1702 1703  
1703 1704  /*ARGSUSED*/
1704 1705  static int
1705 1706  zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1706 1707      rctl_qty_t nv)
1707 1708  {
1708 1709          ASSERT(MUTEX_HELD(&p->p_lock));
1709 1710          ASSERT(e->rcep_t == RCENTITY_ZONE);
1710 1711          if (e->rcep_p.zone == NULL)
1711 1712                  return (0);
1712 1713          e->rcep_p.zone->zone_max_lofi_ctl = nv;
1713 1714          return (0);
1714 1715  }
1715 1716  
1716 1717  static rctl_ops_t zone_max_lofi_ops = {
1717 1718          rcop_no_action,
1718 1719          zone_max_lofi_usage,
1719 1720          zone_max_lofi_set,
1720 1721          zone_max_lofi_test
1721 1722  };
1722 1723  
1723 1724  /*
1724 1725   * Helper function to brand the zone with a unique ID.
1725 1726   */
1726 1727  static void
1727 1728  zone_uniqid(zone_t *zone)
1728 1729  {
1729 1730          static uint64_t uniqid = 0;
1730 1731  
1731 1732          ASSERT(MUTEX_HELD(&zonehash_lock));
1732 1733          zone->zone_uniqid = uniqid++;
1733 1734  }
1734 1735  
1735 1736  /*
1736 1737   * Returns a held pointer to the "kcred" for the specified zone.
1737 1738   */
1738 1739  struct cred *
1739 1740  zone_get_kcred(zoneid_t zoneid)
1740 1741  {
1741 1742          zone_t *zone;
1742 1743          cred_t *cr;
1743 1744  
1744 1745          if ((zone = zone_find_by_id(zoneid)) == NULL)
1745 1746                  return (NULL);
1746 1747          cr = zone->zone_kcred;
1747 1748          crhold(cr);
1748 1749          zone_rele(zone);
1749 1750          return (cr);
1750 1751  }
1751 1752  
1752 1753  static int
1753 1754  zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1754 1755  {
1755 1756          zone_t *zone = ksp->ks_private;
1756 1757          zone_kstat_t *zk = ksp->ks_data;
1757 1758  
1758 1759          if (rw == KSTAT_WRITE)
1759 1760                  return (EACCES);
1760 1761  
1761 1762          zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1762 1763          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1763 1764          return (0);
1764 1765  }
1765 1766  
1766 1767  static int
1767 1768  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1768 1769  {
1769 1770          zone_t *zone = ksp->ks_private;
1770 1771          zone_kstat_t *zk = ksp->ks_data;
1771 1772  
1772 1773          if (rw == KSTAT_WRITE)
1773 1774                  return (EACCES);
1774 1775  
1775 1776          zk->zk_usage.value.ui64 = zone->zone_nprocs;
1776 1777          zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1777 1778          return (0);
1778 1779  }
1779 1780  
1780 1781  static int
1781 1782  zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1782 1783  {
1783 1784          zone_t *zone = ksp->ks_private;
1784 1785          zone_kstat_t *zk = ksp->ks_data;
1785 1786  
1786 1787          if (rw == KSTAT_WRITE)
1787 1788                  return (EACCES);
1788 1789  
1789 1790          zk->zk_usage.value.ui64 = zone->zone_max_swap;
1790 1791          zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1791 1792          return (0);
1792 1793  }
1793 1794  
1794 1795  static kstat_t *
1795 1796  zone_kstat_create_common(zone_t *zone, char *name,
1796 1797      int (*updatefunc) (kstat_t *, int))
1797 1798  {
1798 1799          kstat_t *ksp;
1799 1800          zone_kstat_t *zk;
1800 1801  
1801 1802          ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1802 1803              sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1803 1804              KSTAT_FLAG_VIRTUAL);
1804 1805  
1805 1806          if (ksp == NULL)
1806 1807                  return (NULL);
1807 1808  
1808 1809          zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1809 1810          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1810 1811          kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1811 1812          kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1812 1813          kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1813 1814          kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1814 1815          ksp->ks_update = updatefunc;
1815 1816          ksp->ks_private = zone;
1816 1817          kstat_install(ksp);
1817 1818          return (ksp);
1818 1819  }
1819 1820  
1820 1821  static int
1821 1822  zone_misc_kstat_update(kstat_t *ksp, int rw)
1822 1823  {
1823 1824          zone_t *zone = ksp->ks_private;
1824 1825          zone_misc_kstat_t *zmp = ksp->ks_data;
1825 1826          hrtime_t tmp;
1826 1827  
1827 1828          if (rw == KSTAT_WRITE)
1828 1829                  return (EACCES);
1829 1830

↓ open down ↓

1795 lines elided

↑ open up ↑

1830 1831          tmp = zone->zone_utime;
1831 1832          scalehrtime(&tmp);
1832 1833          zmp->zm_utime.value.ui64 = tmp;
1833 1834          tmp = zone->zone_stime;
1834 1835          scalehrtime(&tmp);
1835 1836          zmp->zm_stime.value.ui64 = tmp;
1836 1837          tmp = zone->zone_wtime;
1837 1838          scalehrtime(&tmp);
1838 1839          zmp->zm_wtime.value.ui64 = tmp;
1839 1840  
     1841 +        zmp->zm_boot_hrtime.value.t = zone->zone_boot_hrtime;
     1842 +
1840 1843          zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1841 1844          zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1842 1845          zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1843 1846  
1844 1847          zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1845 1848          zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1846 1849          zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1847 1850          zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1848 1851  
1849 1852          return (0);

1850 1853  }
1851 1854  
1852 1855  static kstat_t *
1853 1856  zone_misc_kstat_create(zone_t *zone)
1854 1857  {
1855 1858          kstat_t *ksp;
1856 1859          zone_misc_kstat_t *zmp;
1857 1860  
1858 1861          if ((ksp = kstat_create_zone("zones", zone->zone_id,
1859 1862              zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1860 1863              sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1861 1864              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1862 1865                  return (NULL);
1863 1866  
1864 1867          if (zone->zone_id != GLOBAL_ZONEID)
1865 1868                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1866 1869  
1867 1870          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1868 1871          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1869 1872          ksp->ks_lock = &zone->zone_misc_lock;
1870 1873          zone->zone_misc_stats = zmp;
1871 1874  
1872 1875          /* The kstat "name" field is not large enough for a full zonename */
1873 1876          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1874 1877          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1875 1878          kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1876 1879          kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);

↓ open down ↓

27 lines elided

↑ open up ↑

1877 1880          kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1878 1881          kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1879 1882          kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1880 1883          kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1881 1884              KSTAT_DATA_UINT32);
1882 1885          kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1883 1886          kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1884 1887              KSTAT_DATA_UINT32);
1885 1888          kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1886 1889          kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
     1890 +        kstat_named_init(&zmp->zm_boot_hrtime, "boot_hrtime", KSTAT_DATA_TIME);
1887 1891  
1888 1892  
1889 1893          ksp->ks_update = zone_misc_kstat_update;
1890 1894          ksp->ks_private = zone;
1891 1895  
1892 1896          kstat_install(ksp);
1893 1897          return (ksp);
1894 1898  }
1895 1899  
1896 1900  static void

1897 1901  zone_kstat_create(zone_t *zone)
1898 1902  {
1899 1903          zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1900 1904              "lockedmem", zone_lockedmem_kstat_update);
1901 1905          zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1902 1906              "swapresv", zone_swapresv_kstat_update);
1903 1907          zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
1904 1908              "nprocs", zone_nprocs_kstat_update);
1905 1909  
1906 1910          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
1907 1911                  zone->zone_misc_stats = kmem_zalloc(
1908 1912                      sizeof (zone_misc_kstat_t), KM_SLEEP);
1909 1913          }
1910 1914  }
1911 1915  
1912 1916  static void
1913 1917  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
1914 1918  {
1915 1919          void *data;
1916 1920  
1917 1921          if (*pkstat != NULL) {
1918 1922                  data = (*pkstat)->ks_data;
1919 1923                  kstat_delete(*pkstat);
1920 1924                  kmem_free(data, datasz);
1921 1925                  *pkstat = NULL;
1922 1926          }
1923 1927  }
1924 1928  
1925 1929  static void
1926 1930  zone_kstat_delete(zone_t *zone)
1927 1931  {
1928 1932          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
1929 1933              sizeof (zone_kstat_t));
1930 1934          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
1931 1935              sizeof (zone_kstat_t));
1932 1936          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
1933 1937              sizeof (zone_kstat_t));
1934 1938          zone_kstat_delete_common(&zone->zone_misc_ksp,
1935 1939              sizeof (zone_misc_kstat_t));
1936 1940  }
1937 1941  
1938 1942  /*
1939 1943   * Called very early on in boot to initialize the ZSD list so that
1940 1944   * zone_key_create() can be called before zone_init().  It also initializes
1941 1945   * portions of zone0 which may be used before zone_init() is called.  The
1942 1946   * variable "global_zone" will be set when zone0 is fully initialized by
1943 1947   * zone_init().
1944 1948   */
1945 1949  void
1946 1950  zone_zsd_init(void)
1947 1951  {
1948 1952          mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
1949 1953          mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
1950 1954          list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
1951 1955              offsetof(struct zsd_entry, zsd_linkage));
1952 1956          list_create(&zone_active, sizeof (zone_t),
1953 1957              offsetof(zone_t, zone_linkage));
1954 1958          list_create(&zone_deathrow, sizeof (zone_t),
1955 1959              offsetof(zone_t, zone_linkage));
1956 1960  
1957 1961          mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
1958 1962          mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
1959 1963          mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
1960 1964          zone0.zone_shares = 1;
1961 1965          zone0.zone_nlwps = 0;
1962 1966          zone0.zone_nlwps_ctl = INT_MAX;
1963 1967          zone0.zone_nprocs = 0;
1964 1968          zone0.zone_nprocs_ctl = INT_MAX;
1965 1969          zone0.zone_locked_mem = 0;
1966 1970          zone0.zone_locked_mem_ctl = UINT64_MAX;
1967 1971          ASSERT(zone0.zone_max_swap == 0);
1968 1972          zone0.zone_max_swap_ctl = UINT64_MAX;
1969 1973          zone0.zone_max_lofi = 0;
1970 1974          zone0.zone_max_lofi_ctl = UINT64_MAX;
1971 1975          zone0.zone_shmmax = 0;
1972 1976          zone0.zone_ipc.ipcq_shmmni = 0;
1973 1977          zone0.zone_ipc.ipcq_semmni = 0;
1974 1978          zone0.zone_ipc.ipcq_msgmni = 0;
1975 1979          zone0.zone_name = GLOBAL_ZONENAME;
1976 1980          zone0.zone_nodename = utsname.nodename;
1977 1981          zone0.zone_domain = srpc_domain;
1978 1982          zone0.zone_hostid = HW_INVALID_HOSTID;
1979 1983          zone0.zone_fs_allowed = NULL;
1980 1984          zone0.zone_ref = 1;
1981 1985          zone0.zone_id = GLOBAL_ZONEID;
1982 1986          zone0.zone_status = ZONE_IS_RUNNING;
1983 1987          zone0.zone_rootpath = "/";
1984 1988          zone0.zone_rootpathlen = 2;
1985 1989          zone0.zone_psetid = ZONE_PS_INVAL;
1986 1990          zone0.zone_ncpus = 0;
1987 1991          zone0.zone_ncpus_online = 0;
1988 1992          zone0.zone_proc_initpid = 1;
1989 1993          zone0.zone_initname = initname;
1990 1994          zone0.zone_lockedmem_kstat = NULL;
1991 1995          zone0.zone_swapresv_kstat = NULL;
1992 1996          zone0.zone_nprocs_kstat = NULL;
1993 1997  
1994 1998          zone0.zone_stime = 0;
1995 1999          zone0.zone_utime = 0;
1996 2000          zone0.zone_wtime = 0;
1997 2001  
1998 2002          list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
1999 2003              offsetof(zone_ref_t, zref_linkage));
2000 2004          list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2001 2005              offsetof(struct zsd_entry, zsd_linkage));
2002 2006          list_insert_head(&zone_active, &zone0);
2003 2007  
2004 2008          /*
2005 2009           * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2006 2010           * to anything meaningful.  It is assigned to be 'rootdir' in
2007 2011           * vfs_mountroot().
2008 2012           */
2009 2013          zone0.zone_rootvp = NULL;
2010 2014          zone0.zone_vfslist = NULL;
2011 2015          zone0.zone_bootargs = initargs;
2012 2016          zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2013 2017          /*
2014 2018           * The global zone has all privileges
2015 2019           */
2016 2020          priv_fillset(zone0.zone_privset);
2017 2021          /*
2018 2022           * Add p0 to the global zone
2019 2023           */
2020 2024          zone0.zone_zsched = &p0;
2021 2025          p0.p_zone = &zone0;
2022 2026  }
2023 2027  
2024 2028  /*
2025 2029   * Compute a hash value based on the contents of the label and the DOI.  The
2026 2030   * hash algorithm is somewhat arbitrary, but is based on the observation that
2027 2031   * humans will likely pick labels that differ by amounts that work out to be
2028 2032   * multiples of the number of hash chains, and thus stirring in some primes
2029 2033   * should help.
2030 2034   */
2031 2035  static uint_t
2032 2036  hash_bylabel(void *hdata, mod_hash_key_t key)
2033 2037  {
2034 2038          const ts_label_t *lab = (ts_label_t *)key;
2035 2039          const uint32_t *up, *ue;
2036 2040          uint_t hash;
2037 2041          int i;
2038 2042  
2039 2043          _NOTE(ARGUNUSED(hdata));
2040 2044  
2041 2045          hash = lab->tsl_doi + (lab->tsl_doi << 1);
2042 2046          /* we depend on alignment of label, but not representation */
2043 2047          up = (const uint32_t *)&lab->tsl_label;
2044 2048          ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2045 2049          i = 1;
2046 2050          while (up < ue) {
2047 2051                  /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2048 2052                  hash += *up + (*up << ((i % 16) + 1));
2049 2053                  up++;
2050 2054                  i++;
2051 2055          }
2052 2056          return (hash);
2053 2057  }
2054 2058  
2055 2059  /*
2056 2060   * All that mod_hash cares about here is zero (equal) versus non-zero (not
2057 2061   * equal).  This may need to be changed if less than / greater than is ever
2058 2062   * needed.
2059 2063   */
2060 2064  static int
2061 2065  hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2062 2066  {
2063 2067          ts_label_t *lab1 = (ts_label_t *)key1;
2064 2068          ts_label_t *lab2 = (ts_label_t *)key2;
2065 2069  
2066 2070          return (label_equal(lab1, lab2) ? 0 : 1);
2067 2071  }
2068 2072  
2069 2073  /*
2070 2074   * Called by main() to initialize the zones framework.
2071 2075   */
2072 2076  void
2073 2077  zone_init(void)
2074 2078  {
2075 2079          rctl_dict_entry_t *rde;
2076 2080          rctl_val_t *dval;
2077 2081          rctl_set_t *set;
2078 2082          rctl_alloc_gp_t *gp;
2079 2083          rctl_entity_p_t e;
2080 2084          int res;
2081 2085  
2082 2086          ASSERT(curproc == &p0);
2083 2087  
2084 2088          /*
2085 2089           * Create ID space for zone IDs.  ID 0 is reserved for the
2086 2090           * global zone.
2087 2091           */
2088 2092          zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2089 2093  
2090 2094          /*
2091 2095           * Initialize generic zone resource controls, if any.
2092 2096           */
2093 2097          rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2094 2098              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2095 2099              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2096 2100              FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2097 2101  
2098 2102          rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2099 2103              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2100 2104              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2101 2105              RCTL_GLOBAL_INFINITE,
2102 2106              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2103 2107  
2104 2108          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2105 2109              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2106 2110              INT_MAX, INT_MAX, &zone_lwps_ops);
2107 2111  
2108 2112          rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2109 2113              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2110 2114              INT_MAX, INT_MAX, &zone_procs_ops);
2111 2115  
2112 2116          /*
2113 2117           * System V IPC resource controls
2114 2118           */
2115 2119          rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2116 2120              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2117 2121              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2118 2122  
2119 2123          rc_zone_semmni = rctl_register("zone.max-sem-ids",
2120 2124              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2121 2125              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2122 2126  
2123 2127          rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2124 2128              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2125 2129              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2126 2130  
2127 2131          rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2128 2132              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2129 2133              RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2130 2134  
2131 2135          /*
2132 2136           * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2133 2137           * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2134 2138           */
2135 2139          dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2136 2140          bzero(dval, sizeof (rctl_val_t));
2137 2141          dval->rcv_value = 1;
2138 2142          dval->rcv_privilege = RCPRIV_PRIVILEGED;
2139 2143          dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2140 2144          dval->rcv_action_recip_pid = -1;
2141 2145  
2142 2146          rde = rctl_dict_lookup("zone.cpu-shares");
2143 2147          (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2144 2148  
2145 2149          rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2146 2150              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2147 2151              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2148 2152              &zone_locked_mem_ops);
2149 2153  
2150 2154          rc_zone_max_swap = rctl_register("zone.max-swap",
2151 2155              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2152 2156              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2153 2157              &zone_max_swap_ops);
2154 2158  
2155 2159          rc_zone_max_lofi = rctl_register("zone.max-lofi",
2156 2160              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2157 2161              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2158 2162              &zone_max_lofi_ops);
2159 2163  
2160 2164          /*
2161 2165           * Initialize the ``global zone''.
2162 2166           */
2163 2167          set = rctl_set_create();
2164 2168          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2165 2169          mutex_enter(&p0.p_lock);
2166 2170          e.rcep_p.zone = &zone0;
2167 2171          e.rcep_t = RCENTITY_ZONE;
2168 2172          zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2169 2173              gp);
2170 2174  
2171 2175          zone0.zone_nlwps = p0.p_lwpcnt;
2172 2176          zone0.zone_nprocs = 1;
2173 2177          zone0.zone_ntasks = 1;
2174 2178          mutex_exit(&p0.p_lock);
2175 2179          zone0.zone_restart_init = B_TRUE;
2176 2180          zone0.zone_brand = &native_brand;
2177 2181          rctl_prealloc_destroy(gp);
2178 2182          /*
2179 2183           * pool_default hasn't been initialized yet, so we let pool_init()
2180 2184           * take care of making sure the global zone is in the default pool.
2181 2185           */
2182 2186  
2183 2187          /*
2184 2188           * Initialize global zone kstats
2185 2189           */
2186 2190          zone_kstat_create(&zone0);
2187 2191  
2188 2192          /*
2189 2193           * Initialize zone label.
2190 2194           * mlp are initialized when tnzonecfg is loaded.
2191 2195           */
2192 2196          zone0.zone_slabel = l_admin_low;
2193 2197          rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2194 2198          label_hold(l_admin_low);
2195 2199  
2196 2200          /*
2197 2201           * Initialise the lock for the database structure used by mntfs.
2198 2202           */
2199 2203          rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2200 2204  
2201 2205          mutex_enter(&zonehash_lock);
2202 2206          zone_uniqid(&zone0);
2203 2207          ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2204 2208  
2205 2209          zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2206 2210              mod_hash_null_valdtor);
2207 2211          zonehashbyname = mod_hash_create_strhash("zone_by_name",
2208 2212              zone_hash_size, mod_hash_null_valdtor);
2209 2213          /*
2210 2214           * maintain zonehashbylabel only for labeled systems
2211 2215           */
2212 2216          if (is_system_labeled())
2213 2217                  zonehashbylabel = mod_hash_create_extended("zone_by_label",
2214 2218                      zone_hash_size, mod_hash_null_keydtor,
2215 2219                      mod_hash_null_valdtor, hash_bylabel, NULL,
2216 2220                      hash_labelkey_cmp, KM_SLEEP);
2217 2221          zonecount = 1;
2218 2222  
2219 2223          (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2220 2224              (mod_hash_val_t)&zone0);
2221 2225          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2222 2226              (mod_hash_val_t)&zone0);
2223 2227          if (is_system_labeled()) {
2224 2228                  zone0.zone_flags |= ZF_HASHED_LABEL;
2225 2229                  (void) mod_hash_insert(zonehashbylabel,
2226 2230                      (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2227 2231          }
2228 2232          mutex_exit(&zonehash_lock);
2229 2233  
2230 2234          /*
2231 2235           * We avoid setting zone_kcred until now, since kcred is initialized
2232 2236           * sometime after zone_zsd_init() and before zone_init().
2233 2237           */
2234 2238          zone0.zone_kcred = kcred;
2235 2239          /*
2236 2240           * The global zone is fully initialized (except for zone_rootvp which
2237 2241           * will be set when the root filesystem is mounted).
2238 2242           */
2239 2243          global_zone = &zone0;
2240 2244  
2241 2245          /*
2242 2246           * Setup an event channel to send zone status change notifications on
2243 2247           */
2244 2248          res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2245 2249              EVCH_CREAT);
2246 2250  
2247 2251          if (res)
2248 2252                  panic("Sysevent_evc_bind failed during zone setup.\n");
2249 2253  
2250 2254  }
2251 2255  
2252 2256  static void
2253 2257  zone_free(zone_t *zone)
2254 2258  {
2255 2259          ASSERT(zone != global_zone);
2256 2260          ASSERT(zone->zone_ntasks == 0);
2257 2261          ASSERT(zone->zone_nlwps == 0);
2258 2262          ASSERT(zone->zone_nprocs == 0);
2259 2263          ASSERT(zone->zone_cred_ref == 0);
2260 2264          ASSERT(zone->zone_kcred == NULL);
2261 2265          ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2262 2266              zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2263 2267          ASSERT(list_is_empty(&zone->zone_ref_list));
2264 2268  
2265 2269          /*
2266 2270           * Remove any zone caps.
2267 2271           */
2268 2272          cpucaps_zone_remove(zone);
2269 2273  
2270 2274          ASSERT(zone->zone_cpucap == NULL);
2271 2275  
2272 2276          /* remove from deathrow list */
2273 2277          if (zone_status_get(zone) == ZONE_IS_DEAD) {
2274 2278                  ASSERT(zone->zone_ref == 0);
2275 2279                  mutex_enter(&zone_deathrow_lock);
2276 2280                  list_remove(&zone_deathrow, zone);
2277 2281                  mutex_exit(&zone_deathrow_lock);
2278 2282          }
2279 2283  
2280 2284          list_destroy(&zone->zone_ref_list);
2281 2285          zone_free_zsd(zone);
2282 2286          zone_free_datasets(zone);
2283 2287          list_destroy(&zone->zone_dl_list);
2284 2288  
2285 2289          if (zone->zone_rootvp != NULL)
2286 2290                  VN_RELE(zone->zone_rootvp);
2287 2291          if (zone->zone_rootpath)
2288 2292                  kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2289 2293          if (zone->zone_name != NULL)
2290 2294                  kmem_free(zone->zone_name, ZONENAME_MAX);
2291 2295          if (zone->zone_slabel != NULL)
2292 2296                  label_rele(zone->zone_slabel);
2293 2297          if (zone->zone_nodename != NULL)
2294 2298                  kmem_free(zone->zone_nodename, _SYS_NMLN);
2295 2299          if (zone->zone_domain != NULL)
2296 2300                  kmem_free(zone->zone_domain, _SYS_NMLN);
2297 2301          if (zone->zone_privset != NULL)
2298 2302                  kmem_free(zone->zone_privset, sizeof (priv_set_t));
2299 2303          if (zone->zone_rctls != NULL)
2300 2304                  rctl_set_free(zone->zone_rctls);
2301 2305          if (zone->zone_bootargs != NULL)
2302 2306                  strfree(zone->zone_bootargs);
2303 2307          if (zone->zone_initname != NULL)
2304 2308                  strfree(zone->zone_initname);
2305 2309          if (zone->zone_fs_allowed != NULL)
2306 2310                  strfree(zone->zone_fs_allowed);
2307 2311          if (zone->zone_pfexecd != NULL)
2308 2312                  klpd_freelist(&zone->zone_pfexecd);
2309 2313          id_free(zoneid_space, zone->zone_id);
2310 2314          mutex_destroy(&zone->zone_lock);
2311 2315          cv_destroy(&zone->zone_cv);
2312 2316          rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2313 2317          rw_destroy(&zone->zone_mntfs_db_lock);
2314 2318          kmem_free(zone, sizeof (zone_t));
2315 2319  }
2316 2320  
2317 2321  /*
2318 2322   * See block comment at the top of this file for information about zone
2319 2323   * status values.
2320 2324   */
2321 2325  /*
2322 2326   * Convenience function for setting zone status.
2323 2327   */
2324 2328  static void
2325 2329  zone_status_set(zone_t *zone, zone_status_t status)
2326 2330  {
2327 2331  
2328 2332          nvlist_t *nvl = NULL;
2329 2333          ASSERT(MUTEX_HELD(&zone_status_lock));
2330 2334          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2331 2335              status >= zone_status_get(zone));
2332 2336  
2333 2337          if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2334 2338              nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2335 2339              nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2336 2340              zone_status_table[status]) ||
2337 2341              nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2338 2342              zone_status_table[zone->zone_status]) ||
2339 2343              nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2340 2344              nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2341 2345              sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2342 2346              ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2343 2347  #ifdef DEBUG
2344 2348                  (void) printf(
2345 2349                      "Failed to allocate and send zone state change event.\n");
2346 2350  #endif
2347 2351          }
2348 2352          nvlist_free(nvl);
2349 2353  
2350 2354          zone->zone_status = status;
2351 2355  
2352 2356          cv_broadcast(&zone->zone_cv);
2353 2357  }
2354 2358  
2355 2359  /*
2356 2360   * Public function to retrieve the zone status.  The zone status may
2357 2361   * change after it is retrieved.
2358 2362   */
2359 2363  zone_status_t
2360 2364  zone_status_get(zone_t *zone)
2361 2365  {
2362 2366          return (zone->zone_status);
2363 2367  }
2364 2368  
2365 2369  static int
2366 2370  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2367 2371  {
2368 2372          char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2369 2373          int err = 0;
2370 2374  
2371 2375          ASSERT(zone != global_zone);
2372 2376          if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2373 2377                  goto done;      /* EFAULT or ENAMETOOLONG */
2374 2378  
2375 2379          if (zone->zone_bootargs != NULL)
2376 2380                  strfree(zone->zone_bootargs);
2377 2381  
2378 2382          zone->zone_bootargs = strdup(buf);
2379 2383  
2380 2384  done:
2381 2385          kmem_free(buf, BOOTARGS_MAX);
2382 2386          return (err);
2383 2387  }
2384 2388  
2385 2389  static int
2386 2390  zone_set_brand(zone_t *zone, const char *brand)
2387 2391  {
2388 2392          struct brand_attr *attrp;
2389 2393          brand_t *bp;
2390 2394  
2391 2395          attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2392 2396          if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2393 2397                  kmem_free(attrp, sizeof (struct brand_attr));
2394 2398                  return (EFAULT);
2395 2399          }
2396 2400  
2397 2401          bp = brand_register_zone(attrp);
2398 2402          kmem_free(attrp, sizeof (struct brand_attr));
2399 2403          if (bp == NULL)
2400 2404                  return (EINVAL);
2401 2405  
2402 2406          /*
2403 2407           * This is the only place where a zone can change it's brand.
2404 2408           * We already need to hold zone_status_lock to check the zone
2405 2409           * status, so we'll just use that lock to serialize zone
2406 2410           * branding requests as well.
2407 2411           */
2408 2412          mutex_enter(&zone_status_lock);
2409 2413  
2410 2414          /* Re-Branding is not allowed and the zone can't be booted yet */
2411 2415          if ((ZONE_IS_BRANDED(zone)) ||
2412 2416              (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2413 2417                  mutex_exit(&zone_status_lock);
2414 2418                  brand_unregister_zone(bp);
2415 2419                  return (EINVAL);
2416 2420          }
2417 2421  
2418 2422          /* set up the brand specific data */
2419 2423          zone->zone_brand = bp;
2420 2424          ZBROP(zone)->b_init_brand_data(zone);
2421 2425  
2422 2426          mutex_exit(&zone_status_lock);
2423 2427          return (0);
2424 2428  }
2425 2429  
2426 2430  static int
2427 2431  zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2428 2432  {
2429 2433          char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2430 2434          int err = 0;
2431 2435  
2432 2436          ASSERT(zone != global_zone);
2433 2437          if ((err = copyinstr(zone_fs_allowed, buf,
2434 2438              ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2435 2439                  goto done;
2436 2440  
2437 2441          if (zone->zone_fs_allowed != NULL)
2438 2442                  strfree(zone->zone_fs_allowed);
2439 2443  
2440 2444          zone->zone_fs_allowed = strdup(buf);
2441 2445  
2442 2446  done:
2443 2447          kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2444 2448          return (err);
2445 2449  }
2446 2450  
2447 2451  static int
2448 2452  zone_set_initname(zone_t *zone, const char *zone_initname)
2449 2453  {
2450 2454          char initname[INITNAME_SZ];
2451 2455          size_t len;
2452 2456          int err = 0;
2453 2457  
2454 2458          ASSERT(zone != global_zone);
2455 2459          if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2456 2460                  return (err);   /* EFAULT or ENAMETOOLONG */
2457 2461  
2458 2462          if (zone->zone_initname != NULL)
2459 2463                  strfree(zone->zone_initname);
2460 2464  
2461 2465          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2462 2466          (void) strcpy(zone->zone_initname, initname);
2463 2467          return (0);
2464 2468  }
2465 2469  
2466 2470  static int
2467 2471  zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2468 2472  {
2469 2473          uint64_t mcap;
2470 2474          int err = 0;
2471 2475  
2472 2476          if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2473 2477                  zone->zone_phys_mcap = mcap;
2474 2478  
2475 2479          return (err);
2476 2480  }
2477 2481  
2478 2482  static int
2479 2483  zone_set_sched_class(zone_t *zone, const char *new_class)
2480 2484  {
2481 2485          char sched_class[PC_CLNMSZ];
2482 2486          id_t classid;
2483 2487          int err;
2484 2488  
2485 2489          ASSERT(zone != global_zone);
2486 2490          if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2487 2491                  return (err);   /* EFAULT or ENAMETOOLONG */
2488 2492  
2489 2493          if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2490 2494                  return (set_errno(EINVAL));
2491 2495          zone->zone_defaultcid = classid;
2492 2496          ASSERT(zone->zone_defaultcid > 0 &&
2493 2497              zone->zone_defaultcid < loaded_classes);
2494 2498  
2495 2499          return (0);
2496 2500  }
2497 2501  
2498 2502  /*
2499 2503   * Block indefinitely waiting for (zone_status >= status)
2500 2504   */
2501 2505  void
2502 2506  zone_status_wait(zone_t *zone, zone_status_t status)
2503 2507  {
2504 2508          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2505 2509  
2506 2510          mutex_enter(&zone_status_lock);
2507 2511          while (zone->zone_status < status) {
2508 2512                  cv_wait(&zone->zone_cv, &zone_status_lock);
2509 2513          }
2510 2514          mutex_exit(&zone_status_lock);
2511 2515  }
2512 2516  
2513 2517  /*
2514 2518   * Private CPR-safe version of zone_status_wait().
2515 2519   */
2516 2520  static void
2517 2521  zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2518 2522  {
2519 2523          callb_cpr_t cprinfo;
2520 2524  
2521 2525          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2522 2526  
2523 2527          CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2524 2528              str);
2525 2529          mutex_enter(&zone_status_lock);
2526 2530          while (zone->zone_status < status) {
2527 2531                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
2528 2532                  cv_wait(&zone->zone_cv, &zone_status_lock);
2529 2533                  CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2530 2534          }
2531 2535          /*
2532 2536           * zone_status_lock is implicitly released by the following.
2533 2537           */
2534 2538          CALLB_CPR_EXIT(&cprinfo);
2535 2539  }
2536 2540  
2537 2541  /*
2538 2542   * Block until zone enters requested state or signal is received.  Return (0)
2539 2543   * if signaled, non-zero otherwise.
2540 2544   */
2541 2545  int
2542 2546  zone_status_wait_sig(zone_t *zone, zone_status_t status)
2543 2547  {
2544 2548          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2545 2549  
2546 2550          mutex_enter(&zone_status_lock);
2547 2551          while (zone->zone_status < status) {
2548 2552                  if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2549 2553                          mutex_exit(&zone_status_lock);
2550 2554                          return (0);
2551 2555                  }
2552 2556          }
2553 2557          mutex_exit(&zone_status_lock);
2554 2558          return (1);
2555 2559  }
2556 2560  
2557 2561  /*
2558 2562   * Block until the zone enters the requested state or the timeout expires,
2559 2563   * whichever happens first.  Return (-1) if operation timed out, time remaining
2560 2564   * otherwise.
2561 2565   */
2562 2566  clock_t
2563 2567  zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2564 2568  {
2565 2569          clock_t timeleft = 0;
2566 2570  
2567 2571          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2568 2572  
2569 2573          mutex_enter(&zone_status_lock);
2570 2574          while (zone->zone_status < status && timeleft != -1) {
2571 2575                  timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2572 2576          }
2573 2577          mutex_exit(&zone_status_lock);
2574 2578          return (timeleft);
2575 2579  }
2576 2580  
2577 2581  /*
2578 2582   * Block until the zone enters the requested state, the current process is
2579 2583   * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2580 2584   * operation timed out, 0 if signaled, time remaining otherwise.
2581 2585   */
2582 2586  clock_t
2583 2587  zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2584 2588  {
2585 2589          clock_t timeleft = tim - ddi_get_lbolt();
2586 2590  
2587 2591          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2588 2592  
2589 2593          mutex_enter(&zone_status_lock);
2590 2594          while (zone->zone_status < status) {
2591 2595                  timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2592 2596                      tim);
2593 2597                  if (timeleft <= 0)
2594 2598                          break;
2595 2599          }
2596 2600          mutex_exit(&zone_status_lock);
2597 2601          return (timeleft);
2598 2602  }
2599 2603  
2600 2604  /*
2601 2605   * Zones have two reference counts: one for references from credential
2602 2606   * structures (zone_cred_ref), and one (zone_ref) for everything else.
2603 2607   * This is so we can allow a zone to be rebooted while there are still
2604 2608   * outstanding cred references, since certain drivers cache dblks (which
2605 2609   * implicitly results in cached creds).  We wait for zone_ref to drop to
2606 2610   * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2607 2611   * later freed when the zone_cred_ref drops to 0, though nothing other
2608 2612   * than the zone id and privilege set should be accessed once the zone
2609 2613   * is "dead".
2610 2614   *
2611 2615   * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2612 2616   * to force halt/reboot to block waiting for the zone_cred_ref to drop
2613 2617   * to 0.  This can be useful to flush out other sources of cached creds
2614 2618   * that may be less innocuous than the driver case.
2615 2619   *
2616 2620   * Zones also provide a tracked reference counting mechanism in which zone
2617 2621   * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2618 2622   * debuggers determine the sources of leaked zone references.  See
2619 2623   * zone_hold_ref() and zone_rele_ref() below for more information.
2620 2624   */
2621 2625  
2622 2626  int zone_wait_for_cred = 0;
2623 2627  
2624 2628  static void
2625 2629  zone_hold_locked(zone_t *z)
2626 2630  {
2627 2631          ASSERT(MUTEX_HELD(&z->zone_lock));
2628 2632          z->zone_ref++;
2629 2633          ASSERT(z->zone_ref != 0);
2630 2634  }
2631 2635  
2632 2636  /*
2633 2637   * Increment the specified zone's reference count.  The zone's zone_t structure
2634 2638   * will not be freed as long as the zone's reference count is nonzero.
2635 2639   * Decrement the zone's reference count via zone_rele().
2636 2640   *
2637 2641   * NOTE: This function should only be used to hold zones for short periods of
2638 2642   * time.  Use zone_hold_ref() if the zone must be held for a long time.
2639 2643   */
2640 2644  void
2641 2645  zone_hold(zone_t *z)
2642 2646  {
2643 2647          mutex_enter(&z->zone_lock);
2644 2648          zone_hold_locked(z);
2645 2649          mutex_exit(&z->zone_lock);
2646 2650  }
2647 2651  
/*
 * A zone is ready to be destroyed once its non-cred reference count has
 * dropped to 1 and, if zone_wait_for_cred is set, its cred reference count
 * has dropped to 0 as well.
 */
#define ZONE_IS_UNREF(zone) \
        ((zone)->zone_ref == 1 && \
        (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2655 2659  
2656 2660  /*
2657 2661   * Common zone reference release function invoked by zone_rele() and
2658 2662   * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2659 2663   * zone's subsystem-specific reference counters are not affected by the
2660 2664   * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2661 2665   * removed from the specified zone's reference list.  ref must be non-NULL iff
2662 2666   * subsys is not ZONE_REF_NUM_SUBSYS.
2663 2667   */
2664 2668  static void
2665 2669  zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2666 2670  {
2667 2671          boolean_t wakeup;
2668 2672  
2669 2673          mutex_enter(&z->zone_lock);
2670 2674          ASSERT(z->zone_ref != 0);
2671 2675          z->zone_ref--;
2672 2676          if (subsys != ZONE_REF_NUM_SUBSYS) {
2673 2677                  ASSERT(z->zone_subsys_ref[subsys] != 0);
2674 2678                  z->zone_subsys_ref[subsys]--;
2675 2679                  list_remove(&z->zone_ref_list, ref);
2676 2680          }
2677 2681          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2678 2682                  /* no more refs, free the structure */
2679 2683                  mutex_exit(&z->zone_lock);
2680 2684                  zone_free(z);
2681 2685                  return;
2682 2686          }
2683 2687          /* signal zone_destroy so the zone can finish halting */
2684 2688          wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2685 2689          mutex_exit(&z->zone_lock);
2686 2690  
2687 2691          if (wakeup) {
2688 2692                  /*
2689 2693                   * Grabbing zonehash_lock here effectively synchronizes with
2690 2694                   * zone_destroy() to avoid missed signals.
2691 2695                   */
2692 2696                  mutex_enter(&zonehash_lock);
2693 2697                  cv_broadcast(&zone_destroy_cv);
2694 2698                  mutex_exit(&zonehash_lock);
2695 2699          }
2696 2700  }
2697 2701  
2698 2702  /*
2699 2703   * Decrement the specified zone's reference count.  The specified zone will
2700 2704   * cease to exist after this function returns if the reference count drops to
2701 2705   * zero.  This function should be paired with zone_hold().
2702 2706   */
2703 2707  void
2704 2708  zone_rele(zone_t *z)
2705 2709  {
2706 2710          zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2707 2711  }
2708 2712  
2709 2713  /*
2710 2714   * Initialize a zone reference structure.  This function must be invoked for
2711 2715   * a reference structure before the structure is passed to zone_hold_ref().
2712 2716   */
2713 2717  void
2714 2718  zone_init_ref(zone_ref_t *ref)
2715 2719  {
2716 2720          ref->zref_zone = NULL;
2717 2721          list_link_init(&ref->zref_linkage);
2718 2722  }
2719 2723  
2720 2724  /*
2721 2725   * Acquire a reference to zone z.  The caller must specify the
2722 2726   * zone_ref_subsys_t constant associated with its subsystem.  The specified
2723 2727   * zone_ref_t structure will represent a reference to the specified zone.  Use
2724 2728   * zone_rele_ref() to release the reference.
2725 2729   *
2726 2730   * The referenced zone_t structure will not be freed as long as the zone_t's
2727 2731   * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2728 2732   * references.
2729 2733   *
2730 2734   * NOTE: The zone_ref_t structure must be initialized before it is used.
2731 2735   * See zone_init_ref() above.
2732 2736   */
2733 2737  void
2734 2738  zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2735 2739  {
2736 2740          ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2737 2741  
2738 2742          /*
2739 2743           * Prevent consumers from reusing a reference structure before
2740 2744           * releasing it.
2741 2745           */
2742 2746          VERIFY(ref->zref_zone == NULL);
2743 2747  
2744 2748          ref->zref_zone = z;
2745 2749          mutex_enter(&z->zone_lock);
2746 2750          zone_hold_locked(z);
2747 2751          z->zone_subsys_ref[subsys]++;
2748 2752          ASSERT(z->zone_subsys_ref[subsys] != 0);
2749 2753          list_insert_head(&z->zone_ref_list, ref);
2750 2754          mutex_exit(&z->zone_lock);
2751 2755  }
2752 2756  
2753 2757  /*
2754 2758   * Release the zone reference represented by the specified zone_ref_t.
2755 2759   * The reference is invalid after it's released; however, the zone_ref_t
2756 2760   * structure can be reused without having to invoke zone_init_ref().
2757 2761   * subsys should be the same value that was passed to zone_hold_ref()
2758 2762   * when the reference was acquired.
2759 2763   */
2760 2764  void
2761 2765  zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2762 2766  {
2763 2767          zone_rele_common(ref->zref_zone, ref, subsys);
2764 2768  
2765 2769          /*
2766 2770           * Set the zone_ref_t's zref_zone field to NULL to generate panics
2767 2771           * when consumers dereference the reference.  This helps us catch
2768 2772           * consumers who use released references.  Furthermore, this lets
2769 2773           * consumers reuse the zone_ref_t structure without having to
2770 2774           * invoke zone_init_ref().
2771 2775           */
2772 2776          ref->zref_zone = NULL;
2773 2777  }
2774 2778  
2775 2779  void
2776 2780  zone_cred_hold(zone_t *z)
2777 2781  {
2778 2782          mutex_enter(&z->zone_lock);
2779 2783          z->zone_cred_ref++;
2780 2784          ASSERT(z->zone_cred_ref != 0);
2781 2785          mutex_exit(&z->zone_lock);
2782 2786  }
2783 2787  
2784 2788  void
2785 2789  zone_cred_rele(zone_t *z)
2786 2790  {
2787 2791          boolean_t wakeup;
2788 2792  
2789 2793          mutex_enter(&z->zone_lock);
2790 2794          ASSERT(z->zone_cred_ref != 0);
2791 2795          z->zone_cred_ref--;
2792 2796          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2793 2797                  /* no more refs, free the structure */
2794 2798                  mutex_exit(&z->zone_lock);
2795 2799                  zone_free(z);
2796 2800                  return;
2797 2801          }
2798 2802          /*
2799 2803           * If zone_destroy is waiting for the cred references to drain
2800 2804           * out, and they have, signal it.
2801 2805           */
2802 2806          wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2803 2807              zone_status_get(z) >= ZONE_IS_DEAD);
2804 2808          mutex_exit(&z->zone_lock);
2805 2809  
2806 2810          if (wakeup) {
2807 2811                  /*
2808 2812                   * Grabbing zonehash_lock here effectively synchronizes with
2809 2813                   * zone_destroy() to avoid missed signals.
2810 2814                   */
2811 2815                  mutex_enter(&zonehash_lock);
2812 2816                  cv_broadcast(&zone_destroy_cv);
2813 2817                  mutex_exit(&zonehash_lock);
2814 2818          }
2815 2819  }
2816 2820  
2817 2821  void
2818 2822  zone_task_hold(zone_t *z)
2819 2823  {
2820 2824          mutex_enter(&z->zone_lock);
2821 2825          z->zone_ntasks++;
2822 2826          ASSERT(z->zone_ntasks != 0);
2823 2827          mutex_exit(&z->zone_lock);
2824 2828  }
2825 2829  
           /*
            * Drop one count from zone->zone_ntasks.
            *
            * If more than one task remains, nothing else happens.  If exactly one
            * remains and the zone is ZONE_IS_SHUTTING_DOWN, the zone is moved to
            * ZONE_IS_EMPTY (provided zone_ntasks did not change in the meantime).
            * If none remain, the zone is marked ZONE_IS_DEAD.
            *
            * A temporary hold on the zone_t is taken before zone_lock is dropped and
            * is released just before returning.
            */
2826 2830  void
2827 2831  zone_task_rele(zone_t *zone)
2828 2832  {
2829 2833          uint_t refcnt;
2830 2834  
2831 2835          mutex_enter(&zone->zone_lock);
2832 2836          ASSERT(zone->zone_ntasks != 0);
2833 2837          refcnt = --zone->zone_ntasks;
2834 2838          if (refcnt > 1) {       /* Common case */
2835 2839                  mutex_exit(&zone->zone_lock);
2836 2840                  return;
2837 2841          }
2838 2842          zone_hold_locked(zone); /* so we can use the zone_t later */
2839 2843          mutex_exit(&zone->zone_lock);
2840 2844          if (refcnt == 1) {
2841 2845                  /*
2842 2846                   * See if the zone is shutting down.
2843 2847                   */
2844 2848                  mutex_enter(&zone_status_lock);
2845 2849                  if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2846 2850                          goto out;
2847 2851                  }
2848 2852  
2849 2853                  /*
2850 2854                   * Make sure the ntasks didn't change since we
2851 2855                   * dropped zone_lock.
2852 2856                   */
2853 2857                  mutex_enter(&zone->zone_lock);
2854 2858                  if (refcnt != zone->zone_ntasks) {
2855 2859                          mutex_exit(&zone->zone_lock);
2856 2860                          goto out;
2857 2861                  }
2858 2862                  mutex_exit(&zone->zone_lock);
2859 2863  
2860 2864                  /*
2861 2865                   * No more user processes in the zone.  The zone is empty.
2862 2866                   */
2863 2867                  zone_status_set(zone, ZONE_IS_EMPTY);
2864 2868                  goto out;
2865 2869          }
2866 2870  
2867 2871          ASSERT(refcnt == 0);
2868 2872          /*
2869 2873           * zsched has exited; the zone is dead.
2870 2874           */
2871 2875          zone->zone_zsched = NULL;               /* paranoia */
2872 2876          mutex_enter(&zone_status_lock);
2873 2877          zone_status_set(zone, ZONE_IS_DEAD);
                   /*
                    * Both the refcnt == 1 and refcnt == 0 paths arrive at 'out' with
                    * zone_status_lock held; release it and then the temporary hold
                    * taken by zone_hold_locked() above.
                    */
2874 2878  out:
2875 2879          mutex_exit(&zone_status_lock);
2876 2880          zone_rele(zone);
2877 2881  }
2878 2882  
           /*
            * Return the zone ID of the zone that the current process (curproc)
            * belongs to.
            */
2879 2883  zoneid_t
2880 2884  getzoneid(void)
2881 2885  {
2882 2886          return (curproc->p_zone->zone_id);
2883 2887  }
2884 2888  
2885 2889  /*
2886 2890   * Internal versions of zone_find_by_*().  These don't zone_hold() or
2887 2891   * check the validity of a zone's state.
            *
            * zone_find_all_by_id() looks zoneid up in zonehashbyid.  The caller
            * must hold zonehash_lock (asserted).  Returns the zone_t found, or
            * NULL if the lookup fails.
2888 2892   */
2889 2893  static zone_t *
2890 2894  zone_find_all_by_id(zoneid_t zoneid)
2891 2895  {
2892 2896          mod_hash_val_t hv;
2893 2897          zone_t *zone = NULL;
2894 2898  
2895 2899          ASSERT(MUTEX_HELD(&zonehash_lock));
2896 2900  
                   /* A zero return means a hit; otherwise zone stays NULL. */
2897 2901          if (mod_hash_find(zonehashbyid,
2898 2902              (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2899 2903                  zone = (zone_t *)hv;
2900 2904          return (zone);
2901 2905  }
2902 2906  
           /*
            * Look label up in zonehashbylabel without taking a hold or checking the
            * zone's status.  The caller must hold zonehash_lock (asserted).  Returns
            * the zone_t found, or NULL if the system is not labeled or the lookup
            * fails.
            */
2903 2907  static zone_t *
2904 2908  zone_find_all_by_label(const ts_label_t *label)
2905 2909  {
2906 2910          mod_hash_val_t hv;
2907 2911          zone_t *zone = NULL;
2908 2912  
2909 2913          ASSERT(MUTEX_HELD(&zonehash_lock));
2910 2914  
2911 2915          /*
2912 2916           * zonehashbylabel is not maintained for unlabeled systems
2913 2917           */
2914 2918          if (!is_system_labeled())
2915 2919                  return (NULL);
                   /* A zero return means a hit; otherwise zone stays NULL. */
2916 2920          if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2917 2921                  zone = (zone_t *)hv;
2918 2922          return (zone);
2919 2923  }
2920 2924  
           /*
            * Look name up in zonehashbyname without taking a hold or checking the
            * zone's status.  The caller must hold zonehash_lock (asserted).  Returns
            * the zone_t found, or NULL if the lookup fails.
            */
2921 2925  static zone_t *
2922 2926  zone_find_all_by_name(char *name)
2923 2927  {
2924 2928          mod_hash_val_t hv;
2925 2929          zone_t *zone = NULL;
2926 2930  
2927 2931          ASSERT(MUTEX_HELD(&zonehash_lock));
2928 2932  
                   /* A zero return means a hit; otherwise zone stays NULL. */
2929 2933          if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2930 2934                  zone = (zone_t *)hv;
2931 2935          return (zone);
2932 2936  }
2933 2937  
2934 2938  /*
2935 2939   * Public interface for looking up a zone by zoneid.  Only returns the zone if
2936 2940   * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2937 2941   * Caller must call zone_rele() once it is done with the zone.
2938 2942   *
2939 2943   * The zone may begin the zone_destroy() sequence immediately after this
2940 2944   * function returns, but may be safely used until zone_rele() is called.
            *
            * Returns NULL if no zone has that ID, or if the zone's status is
            * outside the range ZONE_IS_READY..ZONE_IS_DOWN (inclusive).
2941 2945   */
2942 2946  zone_t *
2943 2947  zone_find_by_id(zoneid_t zoneid)
2944 2948  {
2945 2949          zone_t *zone;
2946 2950          zone_status_t status;
2947 2951  
2948 2952          mutex_enter(&zonehash_lock);
2949 2953          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2950 2954                  mutex_exit(&zonehash_lock);
2951 2955                  return (NULL);
2952 2956          }
2953 2957          status = zone_status_get(zone);
2954 2958          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2955 2959                  /*
2956 2960                   * For all practical purposes the zone doesn't exist.
2957 2961                   */
2958 2962                  mutex_exit(&zonehash_lock);
2959 2963                  return (NULL);
2960 2964          }
                   /* The hold is taken while zonehash_lock is still held. */
2961 2965          zone_hold(zone);
2962 2966          mutex_exit(&zonehash_lock);
2963 2967          return (zone);
2964 2968  }
2965 2969  
2966 2970  /*
2967 2971   * Similar to zone_find_by_id, but using zone label as the key.
            *
            * Returns a held zone_t, or NULL if no zone matches or the zone's
            * status is past ZONE_IS_DOWN.  Caller must zone_rele() the result.
2968 2972   */
2969 2973  zone_t *
2970 2974  zone_find_by_label(const ts_label_t *label)
2971 2975  {
2972 2976          zone_t *zone;
2973 2977          zone_status_t status;
2974 2978  
2975 2979          mutex_enter(&zonehash_lock);
2976 2980          if ((zone = zone_find_all_by_label(label)) == NULL) {
2977 2981                  mutex_exit(&zonehash_lock);
2978 2982                  return (NULL);
2979 2983          }
2980 2984  
2981 2985          status = zone_status_get(zone);
                   /*
                    * NOTE(review): unlike zone_find_by_id() and zone_find_by_name(),
                    * only the upper bound is tested here; zones whose status is
                    * below ZONE_IS_READY are still returned.  Confirm this is
                    * intended.
                    */
2982 2986          if (status > ZONE_IS_DOWN) {
2983 2987                  /*
2984 2988                   * For all practical purposes the zone doesn't exist.
2985 2989                   */
2986 2990                  mutex_exit(&zonehash_lock);
2987 2991                  return (NULL);
2988 2992          }
2989 2993          zone_hold(zone);
2990 2994          mutex_exit(&zonehash_lock);
2991 2995          return (zone);
2992 2996  }
2993 2997  
2994 2998  /*
2995 2999   * Similar to zone_find_by_id, but using zone name as the key.
            *
            * Returns a held zone_t, or NULL if no zone has that name or the
            * zone's status is outside ZONE_IS_READY..ZONE_IS_DOWN (inclusive).
            * Caller must zone_rele() the result.
2996 3000   */
2997 3001  zone_t *
2998 3002  zone_find_by_name(char *name)
2999 3003  {
3000 3004          zone_t *zone;
3001 3005          zone_status_t status;
3002 3006  
3003 3007          mutex_enter(&zonehash_lock);
3004 3008          if ((zone = zone_find_all_by_name(name)) == NULL) {
3005 3009                  mutex_exit(&zonehash_lock);
3006 3010                  return (NULL);
3007 3011          }
3008 3012          status = zone_status_get(zone);
3009 3013          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3010 3014                  /*
3011 3015                   * For all practical purposes the zone doesn't exist.
3012 3016                   */
3013 3017                  mutex_exit(&zonehash_lock);
3014 3018                  return (NULL);
3015 3019          }
                   /* The hold is taken while zonehash_lock is still held. */
3016 3020          zone_hold(zone);
3017 3021          mutex_exit(&zonehash_lock);
3018 3022          return (zone);
3019 3023  }
3020 3024  
3021 3025  /*
3022 3026   * Similar to zone_find_by_id(), using the path as a key.  For instance,
3023 3027   * if there is a zone "foo" rooted at /foo/root, and the path argument
3024 3028   * is "/foo/root/proc", it will return the held zone_t corresponding to
3025 3029   * zone "foo".
3026 3030   *
3027 3031   * zone_find_by_path() always returns a non-NULL value, since at the
3028 3032   * very least every path will be contained in the global zone.
3029 3033   *
3030 3034   * As with the other zone_find_by_*() functions, the caller is
3031 3035   * responsible for zone_rele()ing the return value of this function.
            *
            * A NULL path yields the (held) global zone.  A non-NULL path must be
            * absolute (asserted).
3032 3036   */
3033 3037  zone_t *
3034 3038  zone_find_by_path(const char *path)
3035 3039  {
3036 3040          zone_t *zone;
3037 3041          zone_t *zret = NULL;
3038 3042          zone_status_t status;
3039 3043  
3040 3044          if (path == NULL) {
3041 3045                  /*
3042 3046                   * Call from rootconf().
3043 3047                   */
3044 3048                  zone_hold(global_zone);
3045 3049                  return (global_zone);
3046 3050          }
3047 3051          ASSERT(*path == '/');
3048 3052          mutex_enter(&zonehash_lock);
                   /*
                    * There is no early exit: the last zone on zone_active for which
                    * ZONE_PATH_VISIBLE() is true is the one kept in zret.
                    */
3049 3053          for (zone = list_head(&zone_active); zone != NULL;
3050 3054              zone = list_next(&zone_active, zone)) {
3051 3055                  if (ZONE_PATH_VISIBLE(path, zone))
3052 3056                          zret = zone;
3053 3057          }
3054 3058          ASSERT(zret != NULL);
3055 3059          status = zone_status_get(zret);
3056 3060          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3057 3061                  /*
3058 3062                   * Zone practically doesn't exist.
3059 3063                   */
3060 3064                  zret = global_zone;
3061 3065          }
3062 3066          zone_hold(zret);
3063 3067          mutex_exit(&zonehash_lock);
3064 3068          return (zret);
3065 3069  }
3066 3070  
3067 3071  /*
3068 3072   * Public interface for updating per-zone load averages.  Called once per
3069 3073   * second.
3070 3074   *
3071 3075   * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
            *
            * Walks zone_active under zonehash_lock and takes each zone's zone_lock
            * while that zone's zone_loadavg, zone_hp_avenrun[] and zone_avenrun[]
            * are updated.
3072 3076   */
3073 3077  void
3074 3078  zone_loadavg_update()
3075 3079  {
3076 3080          zone_t *zp;
3077 3081          zone_status_t status;
3078 3082          struct loadavg_s *lavg;
3079 3083          hrtime_t zone_total;
3080 3084          int i;
3081 3085          hrtime_t hr_avg;
3082 3086          int nrun;
                   /* f[i] is the multiplier applied to zone_hp_avenrun[i] below. */
3083 3087          static int64_t f[3] = { 135, 27, 9 };
3084 3088          int64_t q, r;
3085 3089  
3086 3090          mutex_enter(&zonehash_lock);
3087 3091          for (zp = list_head(&zone_active); zp != NULL;
3088 3092              zp = list_next(&zone_active, zp)) {
3089 3093                  mutex_enter(&zp->zone_lock);
3090 3094  
3091 3095                  /* Skip zones that are on the way down or not yet up */
3092 3096                  status = zone_status_get(zp);
3093 3097                  if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3094 3098                          /* For all practical purposes the zone doesn't exist. */
3095 3099                          mutex_exit(&zp->zone_lock);
3096 3100                          continue;
3097 3101                  }
3098 3102  
3099 3103                  /*
3100 3104                   * Update the 10 second moving average data in zone_loadavg.
3101 3105                   */
3102 3106                  lavg = &zp->zone_loadavg;
3103 3107  
3104 3108                  zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3105 3109                  scalehrtime(&zone_total);
3106 3110  
                           /*
                            * lg_loads[] is written as a circular buffer of S_LOADAVG_SZ
                            * entries: store the delta since the previous sample (or 0
                            * if the total did not grow), then advance lg_cur with
                            * wrap-around.
                            */
3107 3111                  /* The zone_total should always be increasing. */
3108 3112                  lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3109 3113                      zone_total - lavg->lg_total : 0;
3110 3114                  lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3111 3115                  /* lg_total holds the prev. 1 sec. total */
3112 3116                  lavg->lg_total = zone_total;
3113 3117  
3114 3118                  /*
3115 3119                   * To simplify the calculation, we don't calculate the load avg.
3116 3120                   * until the zone has been up for at least 10 seconds and our
3117 3121                   * moving average is thus full.
3118 3122                   */
3119 3123                  if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3120 3124                          lavg->lg_len++;
3121 3125                          mutex_exit(&zp->zone_lock);
3122 3126                          continue;
3123 3127                  }
3124 3128  
3125 3129                  /* Now calculate the 1min, 5min, 15 min load avg. */
                           /* hr_avg becomes the mean of all S_LOADAVG_SZ samples. */
3126 3130                  hr_avg = 0;
3127 3131                  for (i = 0; i < S_LOADAVG_SZ; i++)
3128 3132                          hr_avg += lavg->lg_loads[i];
3129 3133                  hr_avg = hr_avg / S_LOADAVG_SZ;
3130 3134                  nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3131 3135  
3132 3136                  /* Compute load avg. See comment in calcloadavg() */
3133 3137                  for (i = 0; i < 3; i++) {
                                   /*
                                    * q and r are the parts of zone_hp_avenrun[i]
                                    * above and below bit 16, each shifted left by 7.
                                    */
3134 3138                          q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3135 3139                          r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3136 3140                          zp->zone_hp_avenrun[i] +=
3137 3141                              ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3138 3142  
3139 3143                          /* avenrun[] can only hold 31 bits of load avg. */
3140 3144                          if (zp->zone_hp_avenrun[i] <
3141 3145                              ((uint64_t)1<<(31+16-FSHIFT)))
3142 3146                                  zp->zone_avenrun[i] = (int32_t)
3143 3147                                      (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3144 3148                          else
3145 3149                                  zp->zone_avenrun[i] = 0x7fffffff;
3146 3150                  }
3147 3151  
3148 3152                  mutex_exit(&zp->zone_lock);
3149 3153          }
3150 3154          mutex_exit(&zonehash_lock);
3151 3155  }
3152 3156  
3153 3157  /*
3154 3158   * Get the number of cpus visible to this zone.  The system-wide global
3155 3159   * 'ncpus' is returned if pools are disabled, the caller is in the
3156 3160   * global zone, or a NULL zone argument is passed in.
3157 3161   */
3158 3162  int
3159 3163  zone_ncpus_get(zone_t *zone)
3160 3164  {
                   /* A NULL zone is treated the same as a zone_ncpus of 0. */
3161 3165          int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3162 3166  
                   /* 0 means "no per-zone value"; fall back to the global ncpus. */
3163 3167          return (myncpus != 0 ? myncpus : ncpus);
3164 3168  }
3165 3169  
3166 3170  /*
3167 3171   * Get the number of online cpus visible to this zone.  The system-wide
3168 3172   * global 'ncpus_online' is returned if pools are disabled, the caller
3169 3173   * is in the global zone, or a NULL zone argument is passed in.
3170 3174   */
3171 3175  int
3172 3176  zone_ncpus_online_get(zone_t *zone)
3173 3177  {
                   /* A NULL zone is treated the same as a zone_ncpus_online of 0. */
3174 3178          int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3175 3179  
                   /* 0 means "no per-zone value"; fall back to ncpus_online. */
3176 3180          return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3177 3181  }
3178 3182  
3179 3183  /*
3180 3184   * Return the pool to which the zone is currently bound.
            *
            * The caller must hold the pool lock (asserted via pool_lock_held()).
3181 3185   */
3182 3186  pool_t *
3183 3187  zone_pool_get(zone_t *zone)
3184 3188  {
3185 3189          ASSERT(pool_lock_held());
3186 3190  
3187 3191          return (zone->zone_pool);
3188 3192  }
3189 3193  
3190 3194  /*
3191 3195   * Set the zone's pool pointer and update the zone's visibility to match
3192 3196   * the resources in the new pool.
            *
            * The caller must hold both the pool lock and cpu_lock (asserted).
            * pool and pool->pool_pset are dereferenced unconditionally.
3193 3197   */
3194 3198  void
3195 3199  zone_pool_set(zone_t *zone, pool_t *pool)
3196 3200  {
3197 3201          ASSERT(pool_lock_held());
3198 3202          ASSERT(MUTEX_HELD(&cpu_lock));
3199 3203  
3200 3204          zone->zone_pool = pool;
                   /* Visibility is updated through the pool's processor set ID. */
3201 3205          zone_pset_set(zone, pool->pool_pset->pset_id);
3202 3206  }
3203 3207  
3204 3208  /*
3205 3209   * Return the cached value of the id of the processor set to which the
3206 3210   * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3207 3211   * facility is disabled.
            *
            * The caller must hold cpu_lock (asserted).
3208 3212   */
3209 3213  psetid_t
3210 3214  zone_pset_get(zone_t *zone)
3211 3215  {
3212 3216          ASSERT(MUTEX_HELD(&cpu_lock));
3213 3217  
3214 3218          return (zone->zone_psetid);
3215 3219  }
3216 3220  
3217 3221  /*
3218 3222   * Set the cached value of the id of the processor set to which the zone
3219 3223   * is currently bound.  Also update the zone's visibility to match the
3220 3224   * resources in the new processor set.
            *
            * The caller must hold cpu_lock (asserted).  Nothing is done if the
            * zone is already bound to newpsetid.
3221 3225   */
3222 3226  void
3223 3227  zone_pset_set(zone_t *zone, psetid_t newpsetid)
3224 3228  {
3225 3229          psetid_t oldpsetid;
3226 3230  
3227 3231          ASSERT(MUTEX_HELD(&cpu_lock));
3228 3232          oldpsetid = zone_pset_get(zone);
3229 3233  
3230 3234          if (oldpsetid == newpsetid)
3231 3235                  return;
3232 3236          /*
3233 3237           * Global zone sees all.
3234 3238           */
                   /*
                    * For the global zone neither zone_psetid nor the pset
                    * visibility is touched.  For other zones, visibility of the
                    * new pset is added before visibility of the old one is removed.
                    */
3235 3239          if (zone != global_zone) {
3236 3240                  zone->zone_psetid = newpsetid;
3237 3241                  if (newpsetid != ZONE_PS_INVAL)
3238 3242                          pool_pset_visibility_add(newpsetid, zone);
3239 3243                  if (oldpsetid != ZONE_PS_INVAL)
3240 3244                          pool_pset_visibility_remove(oldpsetid, zone);
3241 3245          }
3242 3246          /*
3243 3247           * Disabling pools, so we should start using the global values
3244 3248           * for ncpus and ncpus_online.
3245 3249           */
                   /* This reset applies to every zone, the global zone included. */
3246 3250          if (newpsetid == ZONE_PS_INVAL) {
3247 3251                  zone->zone_ncpus = 0;
3248 3252                  zone->zone_ncpus_online = 0;
3249 3253          }
3250 3254  }
3251 3255  
3252 3256  /*
3253 3257   * Walk the list of active zones and issue the provided callback for
3254 3258   * each of them.
3255 3259   *
3256 3260   * Caller must not be holding any locks that may be acquired under
3257 3261   * zonehash_lock.  See comment at the beginning of the file for a list of
3258 3262   * common locks and their interactions with zones.
            *
            * The callback is invoked with zonehash_lock held.  Returns 0 if every
            * callback returned 0 (or none was invoked); otherwise the first
            * non-zero callback return value.
3259 3263   */
3260 3264  int
3261 3265  zone_walk(int (*cb)(zone_t *, void *), void *data)
3262 3266  {
3263 3267          zone_t *zone;
3264 3268          int ret = 0;
3265 3269          zone_status_t status;
3266 3270  
3267 3271          mutex_enter(&zonehash_lock);
3268 3272          for (zone = list_head(&zone_active); zone != NULL;
3269 3273              zone = list_next(&zone_active, zone)) {
3270 3274                  /*
3271 3275                   * Skip zones that shouldn't be externally visible.
3272 3276                   */
3273 3277                  status = zone_status_get(zone);
3274 3278                  if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3275 3279                          continue;
3276 3280                  /*
3277 3281                   * Bail immediately if any callback invocation returns a
3278 3282                   * non-zero value.
3279 3283                   */
3280 3284                  ret = (*cb)(zone, data);
3281 3285                  if (ret != 0)
3282 3286                          break;
3283 3287          }
3284 3288          mutex_exit(&zonehash_lock);
3285 3289          return (ret);
3286 3290  }
3287 3291  
           /*
            * Resolve the user-supplied path 'upath' and record it as the zone's
            * root.
            *
            * The path is copied in from user space, looked up (following symlinks),
            * checked for VEXEC access and, if it is a mount point, traversed.  On
            * success the held vnode is stored in zone_rootvp, and the resolved path
            * with a trailing '/' appended is stored in zone_rootpath, with its
            * allocation size (including the NUL) in zone_rootpathlen.
            * ZF_IS_SCRATCH is set when the stored path ends in "/lu/".
            *
            * Returns 0 on success or an errno value on failure.  A lookup that
            * fails with ESTALE is retried a bounded number of times.
            */
3288 3292  static int
3289 3293  zone_set_root(zone_t *zone, const char *upath)
3290 3294  {
3291 3295          vnode_t *vp;
3292 3296          int trycount;
3293 3297          int error = 0;
3294 3298          char *path;
3295 3299          struct pathname upn, pn;
3296 3300          size_t pathlen;
3297 3301  
3298 3302          if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3299 3303                  return (error);
3300 3304  
3301 3305          pn_alloc(&pn);
3302 3306  
3303 3307          /* prevent infinite loop */
3304 3308          trycount = 10;
3305 3309          for (;;) {
3306 3310                  if (--trycount <= 0) {
3307 3311                          error = ESTALE;
3308 3312                          goto out;
3309 3313                  }
3310 3314  
3311 3315                  if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3312 3316                          /*
3313 3317                           * VOP_ACCESS() may cover 'vp' with a new
3314 3318                           * filesystem, if 'vp' is an autoFS vnode.
3315 3319                           * Get the new 'vp' if so.
3316 3320                           */
3317 3321                          if ((error =
3318 3322                              VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3319 3323                              (!vn_ismntpt(vp) ||
3320 3324                              (error = traverse(&vp)) == 0)) {
                                           /*
                                            * Room for the resolved path plus a
                                            * trailing '/' and the NUL.  The byte
                                            * just past the path is overwritten
                                            * with '/', so zone_rootpath always
                                            * ends in '/'.
                                            */
3321 3325                                  pathlen = pn.pn_pathlen + 2;
3322 3326                                  path = kmem_alloc(pathlen, KM_SLEEP);
3323 3327                                  (void) strncpy(path, pn.pn_path,
3324 3328                                      pn.pn_pathlen + 1);
3325 3329                                  path[pathlen - 2] = '/';
3326 3330                                  path[pathlen - 1] = '\0';
3327 3331                                  pn_free(&pn);
3328 3332                                  pn_free(&upn);
3329 3333  
3330 3334                                  /* Success! */
3331 3335                                  break;
3332 3336                          }
3333 3337                          VN_RELE(vp);
3334 3338                  }
                           /* Only ESTALE is retried; any other error is final. */
3335 3339                  if (error != ESTALE)
3336 3340                          goto out;
3337 3341          }
3338 3342  
3339 3343          ASSERT(error == 0);
3340 3344          zone->zone_rootvp = vp;         /* we hold a reference to vp */
3341 3345          zone->zone_rootpath = path;
3342 3346          zone->zone_rootpathlen = pathlen;
                   /*
                    * pathlen counts the terminating NUL, so path + pathlen - 5 is
                    * the final four characters of the stored path.
                    */
3343 3347          if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3344 3348                  zone->zone_flags |= ZF_IS_SCRATCH;
3345 3349          return (0);
3346 3350  
                   /* Failure exit; the success path has already freed pn and upn. */
3347 3351  out:
3348 3352          pn_free(&pn);
3349 3353          pn_free(&upn);
3350 3354          return (error);
3351 3355  }
3352 3356  
           /*
            * True if c is one of '0'-'9', 'a'-'z' or 'A'-'Z'.  The argument is
            * evaluated more than once, so it must not have side effects.
            */
3353 3357  #define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
3354 3358                          ((c) >= 'a' && (c) <= 'z') || \
3355 3359                          ((c) >= 'A' && (c) <= 'Z'))
3356 3360  
           /*
            * Copy the zone name in from the user address 'uname', validate it, and
            * store it in zone->zone_name.
            *
            * Returns 0 on success.  On failure the ZONENAME_MAX-byte buffer is freed
            * and either the copyinstr() error or EINVAL (name not terminated within
            * ZONENAME_MAX bytes, or containing a disallowed character) is returned.
            * On success the buffer is not freed here; zone->zone_name points at it.
            */
3357 3361  static int
3358 3362  zone_set_name(zone_t *zone, const char *uname)
3359 3363  {
3360 3364          char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3361 3365          size_t len;
3362 3366          int i, err;
3363 3367  
3364 3368          if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3365 3369                  kmem_free(kname, ZONENAME_MAX);
3366 3370                  return (err);   /* EFAULT or ENAMETOOLONG */
3367 3371          }
3368 3372  
3369 3373          /* must be less than ZONENAME_MAX */
3370 3374          if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3371 3375                  kmem_free(kname, ZONENAME_MAX);
3372 3376                  return (EINVAL);
3373 3377          }
3374 3378  
3375 3379          /*
3376 3380           * Name must start with an alphanumeric and must contain only
3377 3381           * alphanumerics, '-', '_' and '.'.
3378 3382           */
3379 3383          if (!isalnum(kname[0])) {
3380 3384                  kmem_free(kname, ZONENAME_MAX);
3381 3385                  return (EINVAL);
3382 3386          }
                   /*
                    * NOTE(review): the bound 'len - 1' presumably excludes the
                    * terminating NUL counted in len by copyinstr() -- confirm.
                    */
3383 3387          for (i = 1; i < len - 1; i++) {
3384 3388                  if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3385 3389                      kname[i] != '.') {
3386 3390                          kmem_free(kname, ZONENAME_MAX);
3387 3391                          return (EINVAL);
3388 3392                  }
3389 3393          }
3390 3394  
3391 3395          zone->zone_name = kname;
3392 3396          return (0);
3393 3397  }
3394 3398  
3395 3399  /*
3396 3400   * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3397 3401   * is NULL or it points to a zone with no hostid emulation, then the machine's
3398 3402   * hostid (i.e., the global zone's hostid) is returned.  This function returns
3399 3403   * zero if neither the zone nor the host machine (global zone) have hostids.  It
3400 3404   * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3401 3405   * hostid and the machine's hostid is invalid.
3402 3406   */
3403 3407  uint32_t
3404 3408  zone_get_hostid(zone_t *zonep)
3405 3409  {
3406 3410          unsigned long machine_hostid;
3407 3411  
                   /*
                    * A zone_hostid of HW_INVALID_HOSTID means the zone has no hostid
                    * of its own; the machine's hostid is then parsed from the
                    * hw_serial string as a base-10 number.
                    */
3408 3412          if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3409 3413                  if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3410 3414                          return (HW_INVALID_HOSTID);
3411 3415                  return ((uint32_t)machine_hostid);
3412 3416          }
3413 3417          return (zonep->zone_hostid);
3414 3418  }
3416 3420  /*
3417 3421   * Similar to thread_create(), but makes sure the thread is in the appropriate
3418 3422   * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
            *
            * The new thread is created stopped, linked onto the zone's
            * zone_kthreads list, flagged TP_ZTHREAD, re-homed to zsched's project,
            * and only then set running.  A zone hold is taken on its behalf.
            * Returns the new thread.
3419 3423   */
3420 3424  /*ARGSUSED*/
3421 3425  kthread_t *
3422 3426  zthread_create(
3423 3427      caddr_t stk,
3424 3428      size_t stksize,
3425 3429      void (*proc)(),
3426 3430      void *arg,
3427 3431      size_t len,
3428 3432      pri_t pri)
3429 3433  {
3430 3434          kthread_t *t;
3431 3435          zone_t *zone = curproc->p_zone;
3432 3436          proc_t *pp = zone->zone_zsched;
3433 3437  
3434 3438          zone_hold(zone);        /* Reference to be dropped when thread exits */
3435 3439  
3436 3440          /*
3437 3441           * No-one should be trying to create threads if the zone is shutting
3438 3442           * down and there aren't any kernel threads around.  See comment
3439 3443           * in zthread_exit().
3440 3444           */
3441 3445          ASSERT(!(zone->zone_kthreads == NULL &&
3442 3446              zone_status_get(zone) >= ZONE_IS_EMPTY));
3443 3447          /*
3444 3448           * Create a thread, but don't let it run until we've finished setting
3445 3449           * things up.
3446 3450           */
3447 3451          t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3448 3452          ASSERT(t->t_forw == NULL);
                   /*
                    * zone_kthreads is a circular, doubly-linked list threaded through
                    * t_forw/t_back and modified under zone_status_lock.  The new
                    * thread is linked in just before the current head and then
                    * becomes the head.
                    */
3449 3453          mutex_enter(&zone_status_lock);
3450 3454          if (zone->zone_kthreads == NULL) {
3451 3455                  t->t_forw = t->t_back = t;
3452 3456          } else {
3453 3457                  kthread_t *tx = zone->zone_kthreads;
3454 3458  
3455 3459                  t->t_forw = tx;
3456 3460                  t->t_back = tx->t_back;
3457 3461                  tx->t_back->t_forw = t;
3458 3462                  tx->t_back = t;
3459 3463          }
3460 3464          zone->zone_kthreads = t;
3461 3465          mutex_exit(&zone_status_lock);
3462 3466  
3463 3467          mutex_enter(&pp->p_lock);
3464 3468          t->t_proc_flag |= TP_ZTHREAD;
                   /* Swap the thread's project hold for one on zsched's project. */
3465 3469          project_rele(t->t_proj);
3466 3470          t->t_proj = project_hold(pp->p_task->tk_proj);
3467 3471  
3468 3472          /*
3469 3473           * Setup complete, let it run.
3470 3474           */
3471 3475          thread_lock(t);
3472 3476          t->t_schedflag |= TS_ALLSTART;
3473 3477          setrun_locked(t);
3474 3478          thread_unlock(t);
3475 3479  
3476 3480          mutex_exit(&pp->p_lock);
3477 3481  
3478 3482          return (t);
3479 3483  }
3481 3485  /*
3482 3486   * Similar to thread_exit().  Must be called by threads created via
3483 3487   * zthread_create().
3484 3488   */
3485 3489  void
3486 3490  zthread_exit(void)
3487 3491  {
3488 3492          kthread_t *t = curthread;
3489 3493          proc_t *pp = curproc;
3490 3494          zone_t *zone = pp->p_zone;
3491 3495  
                   /*
                    * zone_status_lock is held across both the reparenting and the
                    * zone_kthreads list update below.
                    */
3492 3496          mutex_enter(&zone_status_lock);
3493 3497  
3494 3498          /*
3495 3499           * Reparent to p0
3496 3500           */
3497 3501          kpreempt_disable();
3498 3502          mutex_enter(&pp->p_lock);
3499 3503          t->t_proc_flag &= ~TP_ZTHREAD;
3500 3504          t->t_procp = &p0;
3501 3505          hat_thread_exit(t);
3502 3506          mutex_exit(&pp->p_lock);
3503 3507          kpreempt_enable();
3504 3508  
                   /* t pointing back at itself means it is the only list member. */
3505 3509          if (t->t_back == t) {
3506 3510                  ASSERT(t->t_forw == t);
3507 3511                  /*
3508 3512                   * If the zone is empty, once the thread count
3509 3513                   * goes to zero no further kernel threads can be
3510 3514                   * created.  This is because if the creator is a process
3511 3515                   * in the zone, then it must have exited before the zone
3512 3516                   * state could be set to ZONE_IS_EMPTY.
3513 3517                   * Otherwise, if the creator is a kernel thread in the
3514 3518                   * zone, the thread count is non-zero.
3515 3519                   *
3516 3520                   * This really means that non-zone kernel threads should
3517 3521                   * not create zone kernel threads.
3518 3522                   */
3519 3523                  zone->zone_kthreads = NULL;
3520 3524                  if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3521 3525                          zone_status_set(zone, ZONE_IS_DOWN);
3522 3526                          /*
3523 3527                           * Remove any CPU caps on this zone.
3524 3528                           */
3525 3529                          cpucaps_zone_remove(zone);
3526 3530                  }
3527 3531          } else {
                           /* Unlink t; if it was the list head, advance the head. */
3528 3532                  t->t_forw->t_back = t->t_back;
3529 3533                  t->t_back->t_forw = t->t_forw;
3530 3534                  if (zone->zone_kthreads == t)
3531 3535                          zone->zone_kthreads = t->t_forw;
3532 3536          }
3533 3537          mutex_exit(&zone_status_lock);
                   /* Drop the zone hold before the thread exits. */
3534 3538          zone_rele(zone);
3535 3539          thread_exit();
3536 3540          /* NOTREACHED */
3537 3541  }
3538 3542  
           /*
            * Replace the vnode pointer *vpp with vp.  A hold is taken on vp, the
            * swap is done under pp->p_lock, and the previous vnode (if any) is
            * released after the lock is dropped.  When auditing is active,
            * audit_chdirec() is called before the swap.
            */
3539 3543  static void
3540 3544  zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3541 3545  {
3542 3546          vnode_t *oldvp;
3543 3547  
3544 3548          /* we're going to hold a reference here to the directory */
3545 3549          VN_HOLD(vp);
3546 3550  
3547 3551          /* update abs cwd/root path see c2/audit.c */
3548 3552          if (AU_AUDITING())
3549 3553                  audit_chdirec(vp, vpp);
3550 3554  
3551 3555          mutex_enter(&pp->p_lock);
3552 3556          oldvp = *vpp;
3553 3557          *vpp = vp;
3554 3558          mutex_exit(&pp->p_lock);
3555 3559          if (oldvp != NULL)
3556 3560                  VN_RELE(oldvp);
3557 3561  }
3559 3563  /*
3560 3564   * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
            *
            * The nvlist must contain exactly the uint64 pairs "privilege", "limit"
            * and "action"; any other name, any non-uint64 pair, or a missing pair
            * yields EINVAL.  Returns 0 on success, with the remaining rv action
            * fields reset.  rv may have been partially written when EINVAL is
            * returned.
3561 3565   */
3562 3566  static int
3563 3567  nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3564 3568  {
3565 3569          nvpair_t *nvp = NULL;
3566 3570          boolean_t priv_set = B_FALSE;
3567 3571          boolean_t limit_set = B_FALSE;
3568 3572          boolean_t action_set = B_FALSE;
3569 3573  
3570 3574          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3571 3575                  const char *name;
3572 3576                  uint64_t ui64;
3573 3577  
3574 3578                  name = nvpair_name(nvp);
3575 3579                  if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3576 3580                          return (EINVAL);
                           /* The type was just checked, so the result is ignored. */
3577 3581                  (void) nvpair_value_uint64(nvp, &ui64);
3578 3582                  if (strcmp(name, "privilege") == 0) {
3579 3583                          /*
3580 3584                           * Currently only privileged values are allowed, but
3581 3585                           * this may change in the future.
3582 3586                           */
3583 3587                          if (ui64 != RCPRIV_PRIVILEGED)
3584 3588                                  return (EINVAL);
3585 3589                          rv->rcv_privilege = ui64;
3586 3590                          priv_set = B_TRUE;
3587 3591                  } else if (strcmp(name, "limit") == 0) {
3588 3592                          rv->rcv_value = ui64;
3589 3593                          limit_set = B_TRUE;
3590 3594                  } else if (strcmp(name, "action") == 0) {
                                   /* Only "no action" and "deny" are accepted. */
3591 3595                          if (ui64 != RCTL_LOCAL_NOACTION &&
3592 3596                              ui64 != RCTL_LOCAL_DENY)
3593 3597                                  return (EINVAL);
3594 3598                          rv->rcv_flagaction = ui64;
3595 3599                          action_set = B_TRUE;
3596 3600                  } else {
3597 3601                          return (EINVAL);
3598 3602                  }
3599 3603          }
3600 3604  
                   /* All three of privilege, limit and action must be present. */
3601 3605          if (!(priv_set && limit_set && action_set))
3602 3606                  return (EINVAL);
3603 3607          rv->rcv_action_signal = 0;
3604 3608          rv->rcv_action_recipient = NULL;
3605 3609          rv->rcv_action_recip_pid = -1;
3606 3610          rv->rcv_firing_time = 0;
3607 3611  
3608 3612          return (0);
3609 3613  }
3611 3615  /*
3612 3616   * Non-global zone version of start_init.
            *
            * Records the calling process as the zone's init, runs
            * start_init_common(), and then either moves a booting zone to
            * ZONE_IS_RUNNING and returns to userland, or -- on a boot error or if
            * the global zone is shutting down -- moves a booting zone to
            * ZONE_IS_SHUTTING_DOWN and disposes of the process.
3613 3617   */
3614 3618  void
3615 3619  zone_start_init(void)
3616 3620  {
3617 3621          proc_t *p = ttoproc(curthread);
3618 3622          zone_t *z = p->p_zone;
3619 3623  
3620 3624          ASSERT(!INGLOBALZONE(curproc));
3621 3625  
3622 3626          /*
3623 3627           * For all purposes (ZONE_ATTR_INITPID and restart_init),
3624 3628           * storing just the pid of init is sufficient.
3625 3629           */
3626 3630          z->zone_proc_initpid = p->p_pid;
3627 3631  
3628 3632          /*
3629 3633           * We maintain zone_boot_err so that we can return the cause of the
3630 3634           * failure back to the caller of the zone_boot syscall.
3631 3635           */
                   /* p->p_zone is the same zone_t as z above. */
3632 3636          p->p_zone->zone_boot_err = start_init_common();
3633 3637  
3634 3638          /*
3635 3639           * We will prevent booting zones from becoming running zones if the
3636 3640           * global zone is shutting down.
3637 3641           */
3638 3642          mutex_enter(&zone_status_lock);
3639 3643          if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3640 3644              ZONE_IS_SHUTTING_DOWN) {
3641 3645                  /*
3642 3646                   * Make sure we are still in the booting state-- we could have
3643 3647                   * raced and already be shutting down, or even further along.
3644 3648                   */
3645 3649                  if (zone_status_get(z) == ZONE_IS_BOOTING) {
3646 3650                          zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3647 3651                  }
3648 3652                  mutex_exit(&zone_status_lock);
3649 3653                  /* It's gone bad, dispose of the process */
                           /*
                            * If proc_exit() returns non-zero, only this lwp exits;
                            * lwp_exit() is entered with p->p_lock held.
                            */
3650 3654                  if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3651 3655                          mutex_enter(&p->p_lock);
3652 3656                          ASSERT(p->p_flag & SEXITLWPS);
3653 3657                          lwp_exit();
3654 3658                  }
3655 3659          } else {
                           /* Only a zone still in ZONE_IS_BOOTING is promoted. */
3656 3660                  if (zone_status_get(z) == ZONE_IS_BOOTING)
3657 3661                          zone_status_set(z, ZONE_IS_RUNNING);
3658 3662                  mutex_exit(&zone_status_lock);
3659 3663                  /* cause the process to return to userland. */
3660 3664                  lwp_rtt();
3661 3665          }
3662 3666  }
/* Argument bundle handed to zsched() through its void * parameter. */
struct zsched_arg {
        zone_t *zone;           /* zone that the zsched process belongs to */
        nvlist_t *nvlist;       /* walked by zsched() to apply rctl values */
};
3668 3672  
/*
 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
 * anything to do with scheduling, but rather with the fact that
 * per-zone kernel threads are parented to zsched, just like regular
 * kernel threads are parented to sched (p0).
 *
 * zsched is also responsible for launching init for the zone.
 *
 * 'arg' is a struct zsched_arg carrying the zone and the nvlist of rctl
 * values to apply.  In order, this function: attaches the current process
 * to the zone, re-parents it, moves its accounting out of the global zone,
 * joins a new task in the zone, switches credentials and root/cwd, sets up
 * the zone's rctls, advances the zone to ZONE_IS_READY, launches init once
 * the zone reaches ZONE_IS_BOOTING, and finally waits for ZONE_IS_DYING
 * before exiting.
 */
static void
zsched(void *arg)
{
        struct zsched_arg *za = arg;
        proc_t *pp = curproc;
        proc_t *initp = proc_init;
        zone_t *zone = za->zone;
        cred_t *cr, *oldcred;
        rctl_set_t *set;
        rctl_alloc_gp_t *gp;
        contract_t *ct = NULL;
        task_t *tk, *oldtk;
        rctl_entity_p_t e;
        kproject_t *pj;

        nvlist_t *nvl = za->nvlist;
        nvpair_t *nvp = NULL;

        /* Label this process "zsched" and drop inherited args and files. */
        bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
        bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
        PTOU(pp)->u_argc = 0;
        PTOU(pp)->u_argv = NULL;
        PTOU(pp)->u_envp = NULL;
        closeall(P_FINFO(pp));

        /*
         * We are this zone's "zsched" process.  As the zone isn't generally
         * visible yet we don't need to grab any locks before initializing its
         * zone_proc pointer.
         */
        zone_hold(zone);  /* this hold is released by zone_destroy() */
        zone->zone_zsched = pp;
        mutex_enter(&pp->p_lock);
        pp->p_zone = zone;
        mutex_exit(&pp->p_lock);

        /*
         * Disassociate process from its 'parent'; parent ourselves to init
         * (pid 1) and change other values as needed.
         */
        sess_create();

        mutex_enter(&pidlock);
        proc_detach(pp);
        pp->p_ppid = 1;
        pp->p_flag |= SZONETOP;
        pp->p_ancpid = 1;
        pp->p_parent = initp;
        /* Link ourselves in at the head of initp's child list. */
        pp->p_psibling = NULL;
        if (initp->p_child)
                initp->p_child->p_psibling = pp;
        pp->p_sibling = initp->p_child;
        initp->p_child = pp;

        /* Decrement what newproc() incremented. */
        upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
        /*
         * Our credentials are about to become kcred-like, so we don't care
         * about the caller's ruid.
         */
        upcount_inc(crgetruid(kcred), zone->zone_id);
        mutex_exit(&pidlock);

        /*
         * getting out of global zone, so decrement lwp and process counts
         */
        pj = pp->p_task->tk_proj;
        mutex_enter(&global_zone->zone_nlwps_lock);
        pj->kpj_nlwps -= pp->p_lwpcnt;
        global_zone->zone_nlwps -= pp->p_lwpcnt;
        pj->kpj_nprocs--;
        global_zone->zone_nprocs--;
        mutex_exit(&global_zone->zone_nlwps_lock);

        /*
         * Decrement locked memory counts on old zone and project.
         */
        mutex_enter(&global_zone->zone_mem_lock);
        global_zone->zone_locked_mem -= pp->p_locked_mem;
        pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
        mutex_exit(&global_zone->zone_mem_lock);

        /*
         * Create and join a new task in project '0' of this zone.
         *
         * We don't need to call holdlwps() since we know we're the only lwp in
         * this process.
         *
         * task_join() returns with p_lock held.
         */
        tk = task_create(0, zone);
        mutex_enter(&cpu_lock);
        oldtk = task_join(tk, 0);

        /* Re-read the project: p_task now refers to the newly joined task. */
        pj = pp->p_task->tk_proj;

        mutex_enter(&zone->zone_mem_lock);
        zone->zone_locked_mem += pp->p_locked_mem;
        pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
        mutex_exit(&zone->zone_mem_lock);

        /*
         * add lwp and process counts to zsched's zone, and increment
         * project's task and process count due to the task created in
         * the above task_create.
         */
        mutex_enter(&zone->zone_nlwps_lock);
        pj->kpj_nlwps += pp->p_lwpcnt;
        pj->kpj_ntasks += 1;
        zone->zone_nlwps += pp->p_lwpcnt;
        pj->kpj_nprocs++;
        zone->zone_nprocs++;
        mutex_exit(&zone->zone_nlwps_lock);

        /* Drop the p_lock that task_join() returned holding, then cpu_lock. */
        mutex_exit(&curproc->p_lock);
        mutex_exit(&cpu_lock);
        task_rele(oldtk);

        /*
         * The process was created by a process in the global zone, hence the
         * credentials are wrong.  We might as well have kcred-ish credentials.
         */
        cr = zone->zone_kcred;
        crhold(cr);
        mutex_enter(&pp->p_crlock);
        oldcred = pp->p_cred;
        pp->p_cred = cr;
        mutex_exit(&pp->p_crlock);
        crfree(oldcred);

        /*
         * Hold credentials again (for thread)
         */
        crhold(cr);

        /*
         * p_lwpcnt can't change since this is a kernel process.
         */
        crset(pp, cr);

        /*
         * Chroot
         */
        zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
        zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);

        /*
         * Initialize zone's rctl set.
         */
        set = rctl_set_create();
        gp = rctl_set_init_prealloc(RCENTITY_ZONE);
        mutex_enter(&pp->p_lock);
        e.rcep_p.zone = zone;
        e.rcep_t = RCENTITY_ZONE;
        zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
        mutex_exit(&pp->p_lock);
        rctl_prealloc_destroy(gp);

        /*
         * Apply the rctls passed in to zone_create().  This is basically a list
         * assignment: all of the old values are removed and the new ones
         * inserted.  That is, if an empty list is passed in, all values are
         * removed.
         */
        while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
                rctl_dict_entry_t *rde;
                rctl_hndl_t hndl;
                char *name;
                nvlist_t **nvlarray;
                uint_t i, nelem;
                int error;      /* For ASSERT()s */

                name = nvpair_name(nvp);
                hndl = rctl_hndl_lookup(name);
                ASSERT(hndl != -1);
                rde = rctl_dict_lookup_hndl(hndl);
                ASSERT(rde != NULL);

                /*
                 * Delete the first value of this rctl repeatedly until the
                 * first value is the RCPRIV_SYSTEM one.
                 */
                for (; /* ever */; ) {
                        rctl_val_t oval;

                        mutex_enter(&pp->p_lock);
                        error = rctl_local_get(hndl, NULL, &oval, pp);
                        mutex_exit(&pp->p_lock);
                        ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
                        ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
                        if (oval.rcv_privilege == RCPRIV_SYSTEM)
                                break;
                        mutex_enter(&pp->p_lock);
                        error = rctl_local_delete(hndl, &oval, pp);
                        mutex_exit(&pp->p_lock);
                        ASSERT(error == 0);
                }
                error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
                ASSERT(error == 0);
                /* Convert each array element to an rctl_val_t and insert it. */
                for (i = 0; i < nelem; i++) {
                        rctl_val_t *nvalp;

                        nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
                        error = nvlist2rctlval(nvlarray[i], nvalp);
                        ASSERT(error == 0);
                        /*
                         * rctl_local_insert can fail if the value being
                         * inserted is a duplicate; this is OK.
                         */
                        mutex_enter(&pp->p_lock);
                        if (rctl_local_insert(hndl, nvalp, pp) != 0)
                                kmem_cache_free(rctl_val_cache, nvalp);
                        mutex_exit(&pp->p_lock);
                }
        }
        /*
         * Tell the world that we're done setting up.
         *
         * At this point we want to set the zone status to ZONE_IS_INITIALIZED
         * and atomically set the zone's processor set visibility.  Once
         * we drop pool_lock() this zone will automatically get updated
         * to reflect any future changes to the pools configuration.
         *
         * Note that after we drop the locks below (zonehash_lock in
         * particular) other operations such as a zone_getattr call can
         * now proceed and observe the zone. That is the reason for doing a
         * state transition to the INITIALIZED state.
         */
        pool_lock();
        mutex_enter(&cpu_lock);
        mutex_enter(&zonehash_lock);
        zone_uniqid(zone);
        zone_zsd_configure(zone);
        if (pool_state == POOL_ENABLED)
                zone_pset_set(zone, pool_default->pool_pset->pset_id);
        mutex_enter(&zone_status_lock);
        ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
        zone_status_set(zone, ZONE_IS_INITIALIZED);
        mutex_exit(&zone_status_lock);
        mutex_exit(&zonehash_lock);
        mutex_exit(&cpu_lock);
        pool_unlock();

        /* Now call the create callback for this key */
        zsd_apply_all_keys(zsd_apply_create, zone);

        /* The callbacks are complete. Mark ZONE_IS_READY */
        mutex_enter(&zone_status_lock);
        ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
        zone_status_set(zone, ZONE_IS_READY);
        mutex_exit(&zone_status_lock);

        /*
         * Once we see the zone transition to the ZONE_IS_BOOTING state,
         * we launch init, and set the state to running.
         */
        zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");

        /*
         * Only launch init if the zone is exactly in the BOOTING state
         * when the wait above returns.
         */
        if (zone_status_get(zone) == ZONE_IS_BOOTING) {
                id_t cid;

                /*
                 * Ok, this is a little complicated.  We need to grab the
                 * zone's pool's scheduling class ID; note that by now, we
                 * are already bound to a pool if we need to be (zoneadmd
                 * will have done that to us while we're in the READY
                 * state).  *But* the scheduling class for the zone's 'init'
                 * must be explicitly passed to newproc, which doesn't
                 * respect pool bindings.
                 *
                 * We hold the pool_lock across the call to newproc() to
                 * close the obvious race: the pool's scheduling class
                 * could change before we manage to create the LWP with
                 * classid 'cid'.
                 */
                pool_lock();
                /*
                 * A positive zone_defaultcid takes precedence over the
                 * pool's class; defaultcid is the fallback when the
                 * selected class ID is -1.
                 */
                if (zone->zone_defaultcid > 0)
                        cid = zone->zone_defaultcid;
                else
                        cid = pool_get_class(zone->zone_pool);
                if (cid == -1)
                        cid = defaultcid;

                /*
                 * If this fails, zone_boot will ultimately fail.  The
                 * state of the zone will be set to SHUTTING_DOWN-- userland
                 * will have to tear down the zone, and fail, or try again.
                 */
                if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
                    minclsyspri - 1, &ct, 0)) != 0) {
                        mutex_enter(&zone_status_lock);
                        zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
                        mutex_exit(&zone_status_lock);
                } else {
                        /*
                         * newproc() succeeded: record the boot time from
                         * both gethrestime_sec() and gethrtime().
                         */
                        zone->zone_boot_time = gethrestime_sec();
                        zone->zone_boot_hrtime = gethrtime();
                }

                pool_unlock();
        }

        /*
         * Wait for zone_destroy() to be called.  This is what we spend
         * most of our life doing.
         */
        zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");

        /* ct is non-NULL only if newproc() above filled it in. */
        if (ct)
                /*
                 * At this point the process contract should be empty.
                 * (Though if it isn't, it's not the end of the world.)
                 */
                VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);

        /*
         * Allow kcred to be freed when all referring processes
         * (including this one) go away.  We can't just do this in
         * zone_free because we need to wait for the zone_cred_ref to
         * drop to 0 before calling zone_free, and the existence of
         * zone_kcred will prevent that.  Thus, we call crfree here to
         * balance the crdup in zone_create.  The crhold calls earlier
         * in zsched will be dropped when the thread and process exit.
         */
        crfree(zone->zone_kcred);
        zone->zone_kcred = NULL;

        exit(CLD_EXITED, 0);
}
4000 4005  
4001 4006  /*
4002 4007   * Helper function to determine if there are any submounts of the
4003 4008   * provided path.  Used to make sure the zone doesn't "inherit" any
4004 4009   * mounts from before it is created.
4005 4010   */
4006 4011  static uint_t
4007 4012  zone_mount_count(const char *rootpath)
4008 4013  {
4009 4014          vfs_t *vfsp;
4010 4015          uint_t count = 0;
4011 4016          size_t rootpathlen = strlen(rootpath);
4012 4017  
4013 4018          /*
4014 4019           * Holding zonehash_lock prevents race conditions with
4015 4020           * vfs_list_add()/vfs_list_remove() since we serialize with
4016 4021           * zone_find_by_path().
4017 4022           */
4018 4023          ASSERT(MUTEX_HELD(&zonehash_lock));
4019 4024          /*
4020 4025           * The rootpath must end with a '/'
4021 4026           */
4022 4027          ASSERT(rootpath[rootpathlen - 1] == '/');
4023 4028  
4024 4029          /*
4025 4030           * This intentionally does not count the rootpath itself if that
4026 4031           * happens to be a mount point.
4027 4032           */
4028 4033          vfs_list_read_lock();
4029 4034          vfsp = rootvfs;
4030 4035          do {
4031 4036                  if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4032 4037                      rootpathlen) == 0)
4033 4038                          count++;
4034 4039                  vfsp = vfsp->vfs_next;
4035 4040          } while (vfsp != rootvfs);
4036 4041          vfs_list_unlock();
4037 4042          return (count);
4038 4043  }
4039 4044  
4040 4045  /*
4041 4046   * Helper function to make sure that a zone created on 'rootpath'
4042 4047   * wouldn't end up containing other zones' rootpaths.
4043 4048   */
4044 4049  static boolean_t
4045 4050  zone_is_nested(const char *rootpath)
4046 4051  {
4047 4052          zone_t *zone;
4048 4053          size_t rootpathlen = strlen(rootpath);
4049 4054          size_t len;
4050 4055  
4051 4056          ASSERT(MUTEX_HELD(&zonehash_lock));
4052 4057  
4053 4058          /*
4054 4059           * zone_set_root() appended '/' and '\0' at the end of rootpath
4055 4060           */
4056 4061          if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4057 4062              (rootpath[1] == '/') && (rootpath[2] == '\0'))
4058 4063                  return (B_TRUE);
4059 4064  
4060 4065          for (zone = list_head(&zone_active); zone != NULL;
4061 4066              zone = list_next(&zone_active, zone)) {
4062 4067                  if (zone == global_zone)
4063 4068                          continue;
4064 4069                  len = strlen(zone->zone_rootpath);
4065 4070                  if (strncmp(rootpath, zone->zone_rootpath,
4066 4071                      MIN(rootpathlen, len)) == 0)
4067 4072                          return (B_TRUE);
4068 4073          }
4069 4074          return (B_FALSE);
4070 4075  }
4071 4076  
4072 4077  static int
4073 4078  zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4074 4079      size_t zone_privssz)
4075 4080  {
4076 4081          priv_set_t *privs;
4077 4082  
4078 4083          if (zone_privssz < sizeof (priv_set_t))
4079 4084                  return (ENOMEM);
4080 4085  
4081 4086          privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4082 4087  
4083 4088          if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4084 4089                  kmem_free(privs, sizeof (priv_set_t));
4085 4090                  return (EFAULT);
4086 4091          }
4087 4092  
4088 4093          zone->zone_privset = privs;
4089 4094          return (0);
4090 4095  }
4091 4096  
4092 4097  /*
4093 4098   * We make creative use of nvlists to pass in rctls from userland.  The list is
4094 4099   * a list of the following structures:
4095 4100   *
4096 4101   * (name = rctl_name, value = nvpair_list_array)
4097 4102   *
4098 4103   * Where each element of the nvpair_list_array is of the form:
4099 4104   *
4100 4105   * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4101 4106   *      (name = "limit", value = uint64_t),
4102 4107   *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4103 4108   */
4104 4109  static int
4105 4110  parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4106 4111  {
4107 4112          nvpair_t *nvp = NULL;
4108 4113          nvlist_t *nvl = NULL;
4109 4114          char *kbuf;
4110 4115          int error;
4111 4116          rctl_val_t rv;
4112 4117  
4113 4118          *nvlp = NULL;
4114 4119  
4115 4120          if (buflen == 0)
4116 4121                  return (0);
4117 4122  
4118 4123          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4119 4124                  return (ENOMEM);
4120 4125          if (copyin(ubuf, kbuf, buflen)) {
4121 4126                  error = EFAULT;
4122 4127                  goto out;
4123 4128          }
4124 4129          if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4125 4130                  /*
4126 4131                   * nvl may have been allocated/free'd, but the value set to
4127 4132                   * non-NULL, so we reset it here.
4128 4133                   */
4129 4134                  nvl = NULL;
4130 4135                  error = EINVAL;
4131 4136                  goto out;
4132 4137          }
4133 4138          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4134 4139                  rctl_dict_entry_t *rde;
4135 4140                  rctl_hndl_t hndl;
4136 4141                  nvlist_t **nvlarray;
4137 4142                  uint_t i, nelem;
4138 4143                  char *name;
4139 4144  
4140 4145                  error = EINVAL;
4141 4146                  name = nvpair_name(nvp);
4142 4147                  if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4143 4148                      != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4144 4149                          goto out;
4145 4150                  }
4146 4151                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
4147 4152                          goto out;
4148 4153                  }
4149 4154                  rde = rctl_dict_lookup_hndl(hndl);
4150 4155                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4151 4156                  ASSERT(error == 0);
4152 4157                  for (i = 0; i < nelem; i++) {
4153 4158                          if (error = nvlist2rctlval(nvlarray[i], &rv))
4154 4159                                  goto out;
4155 4160                  }
4156 4161                  if (rctl_invalid_value(rde, &rv)) {
4157 4162                          error = EINVAL;
4158 4163                          goto out;
4159 4164                  }
4160 4165          }
4161 4166          error = 0;
4162 4167          *nvlp = nvl;
4163 4168  out:
4164 4169          kmem_free(kbuf, buflen);
4165 4170          if (error && nvl != NULL)
4166 4171                  nvlist_free(nvl);
4167 4172          return (error);
4168 4173  }
4169 4174  
4170 4175  int
4171 4176  zone_create_error(int er_error, int er_ext, int *er_out) {
4172 4177          if (er_out != NULL) {
4173 4178                  if (copyout(&er_ext, er_out, sizeof (int))) {
4174 4179                          return (set_errno(EFAULT));
4175 4180                  }
4176 4181          }
4177 4182          return (set_errno(er_error));
4178 4183  }
4179 4184  
4180 4185  static int
4181 4186  zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4182 4187  {
4183 4188          ts_label_t *tsl;
4184 4189          bslabel_t blab;
4185 4190  
4186 4191          /* Get label from user */
4187 4192          if (copyin(lab, &blab, sizeof (blab)) != 0)
4188 4193                  return (EFAULT);
4189 4194          tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4190 4195          if (tsl == NULL)
4191 4196                  return (ENOMEM);
4192 4197  
4193 4198          zone->zone_slabel = tsl;
4194 4199          return (0);
4195 4200  }
4196 4201  
4197 4202  /*
4198 4203   * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4199 4204   */
4200 4205  static int
4201 4206  parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4202 4207  {
4203 4208          char *kbuf;
4204 4209          char *dataset, *next;
4205 4210          zone_dataset_t *zd;
4206 4211          size_t len;
4207 4212  
4208 4213          if (ubuf == NULL || buflen == 0)
4209 4214                  return (0);
4210 4215  
4211 4216          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4212 4217                  return (ENOMEM);
4213 4218  
4214 4219          if (copyin(ubuf, kbuf, buflen) != 0) {
4215 4220                  kmem_free(kbuf, buflen);
4216 4221                  return (EFAULT);
4217 4222          }
4218 4223  
4219 4224          dataset = next = kbuf;
4220 4225          for (;;) {
4221 4226                  zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4222 4227  
4223 4228                  next = strchr(dataset, ',');
4224 4229  
4225 4230                  if (next == NULL)
4226 4231                          len = strlen(dataset);
4227 4232                  else
4228 4233                          len = next - dataset;
4229 4234  
4230 4235                  zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4231 4236                  bcopy(dataset, zd->zd_dataset, len);
4232 4237                  zd->zd_dataset[len] = '\0';
4233 4238  
4234 4239                  list_insert_head(&zone->zone_datasets, zd);
4235 4240  
4236 4241                  if (next == NULL)
4237 4242                          break;
4238 4243  
4239 4244                  dataset = next + 1;
4240 4245          }
4241 4246  
4242 4247          kmem_free(kbuf, buflen);
4243 4248          return (0);
4244 4249  }
4245 4250  
4246 4251  /*
4247 4252   * System call to create/initialize a new zone named 'zone_name', rooted
4248 4253   * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4249 4254   * and initialized with the zone-wide rctls described in 'rctlbuf', and
4250 4255   * with labeling set by 'match', 'doi', and 'label'.
4251 4256   *
4252 4257   * If extended error is non-null, we may use it to return more detailed
4253 4258   * error information.
4254 4259   */
4255 4260  static zoneid_t
4256 4261  zone_create(const char *zone_name, const char *zone_root,
4257 4262      const priv_set_t *zone_privs, size_t zone_privssz,
4258 4263      caddr_t rctlbuf, size_t rctlbufsz,
4259 4264      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4260 4265      int match, uint32_t doi, const bslabel_t *label,
4261 4266      int flags)
4262 4267  {
4263 4268          struct zsched_arg zarg;
4264 4269          nvlist_t *rctls = NULL;
4265 4270          proc_t *pp = curproc;
4266 4271          zone_t *zone, *ztmp;
4267 4272          zoneid_t zoneid;
4268 4273          int error;
4269 4274          int error2 = 0;
4270 4275          char *str;
4271 4276          cred_t *zkcr;
4272 4277          boolean_t insert_label_hash;
4273 4278  
4274 4279          if (secpolicy_zone_config(CRED()) != 0)
4275 4280                  return (set_errno(EPERM));
4276 4281  
4277 4282          /* can't boot zone from within chroot environment */
4278 4283          if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4279 4284                  return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4280 4285                      extended_error));
4281 4286  
4282 4287          zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4283 4288          zoneid = zone->zone_id = id_alloc(zoneid_space);
4284 4289          zone->zone_status = ZONE_IS_UNINITIALIZED;
4285 4290          zone->zone_pool = pool_default;
4286 4291          zone->zone_pool_mod = gethrtime();
4287 4292          zone->zone_psetid = ZONE_PS_INVAL;
4288 4293          zone->zone_ncpus = 0;
4289 4294          zone->zone_ncpus_online = 0;
4290 4295          zone->zone_restart_init = B_TRUE;
4291 4296          zone->zone_brand = &native_brand;
4292 4297          zone->zone_initname = NULL;
4293 4298          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4294 4299          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4295 4300          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4296 4301          cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4297 4302          list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4298 4303              offsetof(zone_ref_t, zref_linkage));
4299 4304          list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4300 4305              offsetof(struct zsd_entry, zsd_linkage));
4301 4306          list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4302 4307              offsetof(zone_dataset_t, zd_linkage));
4303 4308          list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4304 4309              offsetof(zone_dl_t, zdl_linkage));
4305 4310          rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4306 4311          rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4307 4312  
4308 4313          if (flags & ZCF_NET_EXCL) {
4309 4314                  zone->zone_flags |= ZF_NET_EXCL;
4310 4315          }
4311 4316  
4312 4317          if ((error = zone_set_name(zone, zone_name)) != 0) {
4313 4318                  zone_free(zone);
4314 4319                  return (zone_create_error(error, 0, extended_error));
4315 4320          }
4316 4321  
4317 4322          if ((error = zone_set_root(zone, zone_root)) != 0) {
4318 4323                  zone_free(zone);
4319 4324                  return (zone_create_error(error, 0, extended_error));
4320 4325          }
4321 4326          if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4322 4327                  zone_free(zone);
4323 4328                  return (zone_create_error(error, 0, extended_error));
4324 4329          }
4325 4330  
4326 4331          /* initialize node name to be the same as zone name */
4327 4332          zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4328 4333          (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4329 4334          zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4330 4335  
4331 4336          zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4332 4337          zone->zone_domain[0] = '\0';
4333 4338          zone->zone_hostid = HW_INVALID_HOSTID;
4334 4339          zone->zone_shares = 1;
4335 4340          zone->zone_shmmax = 0;
4336 4341          zone->zone_ipc.ipcq_shmmni = 0;
4337 4342          zone->zone_ipc.ipcq_semmni = 0;
4338 4343          zone->zone_ipc.ipcq_msgmni = 0;
4339 4344          zone->zone_bootargs = NULL;
4340 4345          zone->zone_fs_allowed = NULL;
4341 4346          zone->zone_initname =
4342 4347              kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4343 4348          (void) strcpy(zone->zone_initname, zone_default_initname);
4344 4349          zone->zone_nlwps = 0;
4345 4350          zone->zone_nlwps_ctl = INT_MAX;
4346 4351          zone->zone_nprocs = 0;
4347 4352          zone->zone_nprocs_ctl = INT_MAX;
4348 4353          zone->zone_locked_mem = 0;
4349 4354          zone->zone_locked_mem_ctl = UINT64_MAX;
4350 4355          zone->zone_max_swap = 0;
4351 4356          zone->zone_max_swap_ctl = UINT64_MAX;
4352 4357          zone->zone_max_lofi = 0;
4353 4358          zone->zone_max_lofi_ctl = UINT64_MAX;
4354 4359          zone0.zone_lockedmem_kstat = NULL;
4355 4360          zone0.zone_swapresv_kstat = NULL;
4356 4361  
4357 4362          /*
4358 4363           * Zsched initializes the rctls.
4359 4364           */
4360 4365          zone->zone_rctls = NULL;
4361 4366  
4362 4367          if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4363 4368                  zone_free(zone);
4364 4369                  return (zone_create_error(error, 0, extended_error));
4365 4370          }
4366 4371  
4367 4372          if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4368 4373                  zone_free(zone);
4369 4374                  return (set_errno(error));
4370 4375          }
4371 4376  
4372 4377          /*
4373 4378           * Read in the trusted system parameters:
4374 4379           * match flag and sensitivity label.
4375 4380           */
4376 4381          zone->zone_match = match;
4377 4382          if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4378 4383                  /* Fail if requested to set doi to anything but system's doi */
4379 4384                  if (doi != 0 && doi != default_doi) {
4380 4385                          zone_free(zone);
4381 4386                          return (set_errno(EINVAL));
4382 4387                  }
4383 4388                  /* Always apply system's doi to the zone */
4384 4389                  error = zone_set_label(zone, label, default_doi);
4385 4390                  if (error != 0) {
4386 4391                          zone_free(zone);
4387 4392                          return (set_errno(error));
4388 4393                  }
4389 4394                  insert_label_hash = B_TRUE;
4390 4395          } else {
4391 4396                  /* all zones get an admin_low label if system is not labeled */
4392 4397                  zone->zone_slabel = l_admin_low;
4393 4398                  label_hold(l_admin_low);
4394 4399                  insert_label_hash = B_FALSE;
4395 4400          }
4396 4401  
4397 4402          /*
4398 4403           * Stop all lwps since that's what normally happens as part of fork().
4399 4404           * This needs to happen before we grab any locks to avoid deadlock
4400 4405           * (another lwp in the process could be waiting for the held lock).
4401 4406           */
4402 4407          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4403 4408                  zone_free(zone);
4404 4409                  if (rctls)
4405 4410                          nvlist_free(rctls);
4406 4411                  return (zone_create_error(error, 0, extended_error));
4407 4412          }
4408 4413  
4409 4414          if (block_mounts(zone) == 0) {
4410 4415                  mutex_enter(&pp->p_lock);
4411 4416                  if (curthread != pp->p_agenttp)
4412 4417                          continuelwps(pp);
4413 4418                  mutex_exit(&pp->p_lock);
4414 4419                  zone_free(zone);
4415 4420                  if (rctls)
4416 4421                          nvlist_free(rctls);
4417 4422                  return (zone_create_error(error, 0, extended_error));
4418 4423          }
4419 4424  
4420 4425          /*
4421 4426           * Set up credential for kernel access.  After this, any errors
4422 4427           * should go through the dance in errout rather than calling
4423 4428           * zone_free directly.
4424 4429           */
4425 4430          zone->zone_kcred = crdup(kcred);
4426 4431          crsetzone(zone->zone_kcred, zone);
4427 4432          priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4428 4433          priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4429 4434          priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4430 4435          priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4431 4436  
4432 4437          mutex_enter(&zonehash_lock);
4433 4438          /*
4434 4439           * Make sure zone doesn't already exist.
4435 4440           *
4436 4441           * If the system and zone are labeled,
4437 4442           * make sure no other zone exists that has the same label.
4438 4443           */
4439 4444          if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4440 4445              (insert_label_hash &&
4441 4446              (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4442 4447                  zone_status_t status;
4443 4448  
4444 4449                  status = zone_status_get(ztmp);
4445 4450                  if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4446 4451                          error = EEXIST;
4447 4452                  else
4448 4453                          error = EBUSY;
4449 4454  
4450 4455                  if (insert_label_hash)
4451 4456                          error2 = ZE_LABELINUSE;
4452 4457  
4453 4458                  goto errout;
4454 4459          }
4455 4460  
4456 4461          /*
4457 4462           * Don't allow zone creations which would cause one zone's rootpath to
4458 4463           * be accessible from that of another (non-global) zone.
4459 4464           */
4460 4465          if (zone_is_nested(zone->zone_rootpath)) {
4461 4466                  error = EBUSY;
4462 4467                  goto errout;
4463 4468          }
4464 4469  
4465 4470          ASSERT(zonecount != 0);         /* check for leaks */
4466 4471          if (zonecount + 1 > maxzones) {
4467 4472                  error = ENOMEM;
4468 4473                  goto errout;
4469 4474          }
4470 4475  
4471 4476          if (zone_mount_count(zone->zone_rootpath) != 0) {
4472 4477                  error = EBUSY;
4473 4478                  error2 = ZE_AREMOUNTS;
4474 4479                  goto errout;
4475 4480          }
4476 4481  
4477 4482          /*
4478 4483           * Zone is still incomplete, but we need to drop all locks while
4479 4484           * zsched() initializes this zone's kernel process.  We
4480 4485           * optimistically add the zone to the hashtable and associated
4481 4486           * lists so a parallel zone_create() doesn't try to create the
4482 4487           * same zone.
4483 4488           */
4484 4489          zonecount++;
4485 4490          (void) mod_hash_insert(zonehashbyid,
4486 4491              (mod_hash_key_t)(uintptr_t)zone->zone_id,
4487 4492              (mod_hash_val_t)(uintptr_t)zone);
4488 4493          str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4489 4494          (void) strcpy(str, zone->zone_name);
4490 4495          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4491 4496              (mod_hash_val_t)(uintptr_t)zone);
4492 4497          if (insert_label_hash) {
4493 4498                  (void) mod_hash_insert(zonehashbylabel,
4494 4499                      (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4495 4500                  zone->zone_flags |= ZF_HASHED_LABEL;
4496 4501          }
4497 4502  
4498 4503          /*
4499 4504           * Insert into active list.  At this point there are no 'hold's
4500 4505           * on the zone, but everyone else knows not to use it, so we can
4501 4506           * continue to use it.  zsched() will do a zone_hold() if the
4502 4507           * newproc() is successful.
4503 4508           */
4504 4509          list_insert_tail(&zone_active, zone);
4505 4510          mutex_exit(&zonehash_lock);
4506 4511  
4507 4512          zarg.zone = zone;
4508 4513          zarg.nvlist = rctls;
4509 4514          /*
4510 4515           * The process, task, and project rctls are probably wrong;
4511 4516           * we need an interface to get the default values of all rctls,
4512 4517           * and initialize zsched appropriately.  I'm not sure that that
4513 4518           * makes much of a difference, though.
4514 4519           */
4515 4520          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4516 4521          if (error != 0) {
4517 4522                  /*
4518 4523                   * We need to undo all globally visible state.
4519 4524                   */
4520 4525                  mutex_enter(&zonehash_lock);
4521 4526                  list_remove(&zone_active, zone);
4522 4527                  if (zone->zone_flags & ZF_HASHED_LABEL) {
4523 4528                          ASSERT(zone->zone_slabel != NULL);
4524 4529                          (void) mod_hash_destroy(zonehashbylabel,
4525 4530                              (mod_hash_key_t)zone->zone_slabel);
4526 4531                  }
4527 4532                  (void) mod_hash_destroy(zonehashbyname,
4528 4533                      (mod_hash_key_t)(uintptr_t)zone->zone_name);
4529 4534                  (void) mod_hash_destroy(zonehashbyid,
4530 4535                      (mod_hash_key_t)(uintptr_t)zone->zone_id);
4531 4536                  ASSERT(zonecount > 1);
4532 4537                  zonecount--;
4533 4538                  goto errout;
4534 4539          }
4535 4540  
4536 4541          /*
4537 4542           * Zone creation can't fail from now on.
4538 4543           */
4539 4544  
4540 4545          /*
4541 4546           * Create zone kstats
4542 4547           */
4543 4548          zone_kstat_create(zone);
4544 4549  
4545 4550          /*
4546 4551           * Let the other lwps continue.
4547 4552           */
4548 4553          mutex_enter(&pp->p_lock);
4549 4554          if (curthread != pp->p_agenttp)
4550 4555                  continuelwps(pp);
4551 4556          mutex_exit(&pp->p_lock);
4552 4557  
4553 4558          /*
4554 4559           * Wait for zsched to finish initializing the zone.
4555 4560           */
4556 4561          zone_status_wait(zone, ZONE_IS_READY);
4557 4562          /*
4558 4563           * The zone is fully visible, so we can let mounts progress.
4559 4564           */
4560 4565          resume_mounts(zone);
4561 4566          if (rctls)
4562 4567                  nvlist_free(rctls);
4563 4568  
4564 4569          return (zoneid);
4565 4570  
4566 4571  errout:
4567 4572          mutex_exit(&zonehash_lock);
4568 4573          /*
4569 4574           * Let the other lwps continue.
4570 4575           */
4571 4576          mutex_enter(&pp->p_lock);
4572 4577          if (curthread != pp->p_agenttp)
4573 4578                  continuelwps(pp);
4574 4579          mutex_exit(&pp->p_lock);
4575 4580  
4576 4581          resume_mounts(zone);
4577 4582          if (rctls)
4578 4583                  nvlist_free(rctls);
4579 4584          /*
4580 4585           * There is currently one reference to the zone, a cred_ref from
4581 4586           * zone_kcred.  To free the zone, we call crfree, which will call
4582 4587           * zone_cred_rele, which will call zone_free.
4583 4588           */
4584 4589          ASSERT(zone->zone_cred_ref == 1);
4585 4590          ASSERT(zone->zone_kcred->cr_ref == 1);
4586 4591          ASSERT(zone->zone_ref == 0);
4587 4592          zkcr = zone->zone_kcred;
4588 4593          zone->zone_kcred = NULL;
4589 4594          crfree(zkcr);                           /* triggers call to zone_free */
4590 4595          return (zone_create_error(error, error2, extended_error));
4591 4596  }
4592 4597  
4593 4598  /*
4594 4599   * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4595 4600   * the heavy lifting.  initname is the path to the program to launch
4596 4601   * at the "top" of the zone; if this is NULL, we use the system default,
4597 4602   * which is stored at zone_default_initname.
4598 4603   */
4599 4604  static int
4600 4605  zone_boot(zoneid_t zoneid)
4601 4606  {
4602 4607          int err;
4603 4608          zone_t *zone;
4604 4609  
4605 4610          if (secpolicy_zone_config(CRED()) != 0)
4606 4611                  return (set_errno(EPERM));
4607 4612          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4608 4613                  return (set_errno(EINVAL));
4609 4614  
4610 4615          mutex_enter(&zonehash_lock);
4611 4616          /*
4612 4617           * Look for zone under hash lock to prevent races with calls to
4613 4618           * zone_shutdown, zone_destroy, etc.
4614 4619           */
4615 4620          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4616 4621                  mutex_exit(&zonehash_lock);
4617 4622                  return (set_errno(EINVAL));
4618 4623          }
4619 4624  
4620 4625          mutex_enter(&zone_status_lock);
4621 4626          if (zone_status_get(zone) != ZONE_IS_READY) {
4622 4627                  mutex_exit(&zone_status_lock);
4623 4628                  mutex_exit(&zonehash_lock);
4624 4629                  return (set_errno(EINVAL));
4625 4630          }
4626 4631          zone_status_set(zone, ZONE_IS_BOOTING);
4627 4632          mutex_exit(&zone_status_lock);
4628 4633  
4629 4634          zone_hold(zone);        /* so we can use the zone_t later */
4630 4635          mutex_exit(&zonehash_lock);
4631 4636  
4632 4637          if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4633 4638                  zone_rele(zone);
4634 4639                  return (set_errno(EINTR));
4635 4640          }
4636 4641  
4637 4642          /*
4638 4643           * Boot (starting init) might have failed, in which case the zone
4639 4644           * will go to the SHUTTING_DOWN state; an appropriate errno will
4640 4645           * be placed in zone->zone_boot_err, and so we return that.
4641 4646           */
4642 4647          err = zone->zone_boot_err;
4643 4648          zone_rele(zone);
4644 4649          return (err ? set_errno(err) : 0);
4645 4650  }
4646 4651  
4647 4652  /*
4648 4653   * Kills all user processes in the zone, waiting for them all to exit
4649 4654   * before returning.
4650 4655   */
4651 4656  static int
4652 4657  zone_empty(zone_t *zone)
4653 4658  {
4654 4659          int waitstatus;
4655 4660  
4656 4661          /*
4657 4662           * We need to drop zonehash_lock before killing all
4658 4663           * processes, otherwise we'll deadlock with zone_find_*
4659 4664           * which can be called from the exit path.
4660 4665           */
4661 4666          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4662 4667          while ((waitstatus = zone_status_timedwait_sig(zone,
4663 4668              ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4664 4669                  killall(zone->zone_id);
4665 4670          }
4666 4671          /*
4667 4672           * return EINTR if we were signaled
4668 4673           */
4669 4674          if (waitstatus == 0)
4670 4675                  return (EINTR);
4671 4676          return (0);
4672 4677  }
4673 4678  
4674 4679  /*
4675 4680   * This function implements the policy for zone visibility.
4676 4681   *
4677 4682   * In standard Solaris, a non-global zone can only see itself.
4678 4683   *
4679 4684   * In Trusted Extensions, a labeled zone can lookup any zone whose label
4680 4685   * it dominates. For this test, the label of the global zone is treated as
4681 4686   * admin_high so it is special-cased instead of being checked for dominance.
4682 4687   *
4683 4688   * Returns true if zone attributes are viewable, false otherwise.
4684 4689   */
4685 4690  static boolean_t
4686 4691  zone_list_access(zone_t *zone)
4687 4692  {
4688 4693  
4689 4694          if (curproc->p_zone == global_zone ||
4690 4695              curproc->p_zone == zone) {
4691 4696                  return (B_TRUE);
4692 4697          } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4693 4698                  bslabel_t *curproc_label;
4694 4699                  bslabel_t *zone_label;
4695 4700  
4696 4701                  curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4697 4702                  zone_label = label2bslabel(zone->zone_slabel);
4698 4703  
4699 4704                  if (zone->zone_id != GLOBAL_ZONEID &&
4700 4705                      bldominates(curproc_label, zone_label)) {
4701 4706                          return (B_TRUE);
4702 4707                  } else {
4703 4708                          return (B_FALSE);
4704 4709                  }
4705 4710          } else {
4706 4711                  return (B_FALSE);
4707 4712          }
4708 4713  }
4709 4714  
4710 4715  /*
4711 4716   * Systemcall to start the zone's halt sequence.  By the time this
4712 4717   * function successfully returns, all user processes and kernel threads
4713 4718   * executing in it will have exited, ZSD shutdown callbacks executed,
4714 4719   * and the zone status set to ZONE_IS_DOWN.
4715 4720   *
4716 4721   * It is possible that the call will interrupt itself if the caller is the
4717 4722   * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4718 4723   */
4719 4724  static int
4720 4725  zone_shutdown(zoneid_t zoneid)
4721 4726  {
4722 4727          int error;
4723 4728          zone_t *zone;
4724 4729          zone_status_t status;
4725 4730  
4726 4731          if (secpolicy_zone_config(CRED()) != 0)
4727 4732                  return (set_errno(EPERM));
4728 4733          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4729 4734                  return (set_errno(EINVAL));
4730 4735  
4731 4736          mutex_enter(&zonehash_lock);
4732 4737          /*
4733 4738           * Look for zone under hash lock to prevent races with other
4734 4739           * calls to zone_shutdown and zone_destroy.
4735 4740           */
4736 4741          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4737 4742                  mutex_exit(&zonehash_lock);
4738 4743                  return (set_errno(EINVAL));
4739 4744          }
4740 4745  
4741 4746          /*
4742 4747           * We have to drop zonehash_lock before calling block_mounts.
4743 4748           * Hold the zone so we can continue to use the zone_t.
4744 4749           */
4745 4750          zone_hold(zone);
4746 4751          mutex_exit(&zonehash_lock);
4747 4752  
4748 4753          /*
4749 4754           * Block mounts so that VFS_MOUNT() can get an accurate view of
4750 4755           * the zone's status with regards to ZONE_IS_SHUTTING down.
4751 4756           *
4752 4757           * e.g. NFS can fail the mount if it determines that the zone
4753 4758           * has already begun the shutdown sequence.
4754 4759           *
4755 4760           */
4756 4761          if (block_mounts(zone) == 0) {
4757 4762                  zone_rele(zone);
4758 4763                  return (set_errno(EINTR));
4759 4764          }
4760 4765  
4761 4766          mutex_enter(&zonehash_lock);
4762 4767          mutex_enter(&zone_status_lock);
4763 4768          status = zone_status_get(zone);
4764 4769          /*
4765 4770           * Fail if the zone isn't fully initialized yet.
4766 4771           */
4767 4772          if (status < ZONE_IS_READY) {
4768 4773                  mutex_exit(&zone_status_lock);
4769 4774                  mutex_exit(&zonehash_lock);
4770 4775                  resume_mounts(zone);
4771 4776                  zone_rele(zone);
4772 4777                  return (set_errno(EINVAL));
4773 4778          }
4774 4779          /*
4775 4780           * If conditions required for zone_shutdown() to return have been met,
4776 4781           * return success.
4777 4782           */
4778 4783          if (status >= ZONE_IS_DOWN) {
4779 4784                  mutex_exit(&zone_status_lock);
4780 4785                  mutex_exit(&zonehash_lock);
4781 4786                  resume_mounts(zone);
4782 4787                  zone_rele(zone);
4783 4788                  return (0);
4784 4789          }
4785 4790          /*
4786 4791           * If zone_shutdown() hasn't been called before, go through the motions.
4787 4792           * If it has, there's nothing to do but wait for the kernel threads to
4788 4793           * drain.
4789 4794           */
4790 4795          if (status < ZONE_IS_EMPTY) {
4791 4796                  uint_t ntasks;
4792 4797  
4793 4798                  mutex_enter(&zone->zone_lock);
4794 4799                  if ((ntasks = zone->zone_ntasks) != 1) {
4795 4800                          /*
4796 4801                           * There's still stuff running.
4797 4802                           */
4798 4803                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4799 4804                  }
4800 4805                  mutex_exit(&zone->zone_lock);
4801 4806                  if (ntasks == 1) {
4802 4807                          /*
4803 4808                           * The only way to create another task is through
4804 4809                           * zone_enter(), which will block until we drop
4805 4810                           * zonehash_lock.  The zone is empty.
4806 4811                           */
4807 4812                          if (zone->zone_kthreads == NULL) {
4808 4813                                  /*
4809 4814                                   * Skip ahead to ZONE_IS_DOWN
4810 4815                                   */
4811 4816                                  zone_status_set(zone, ZONE_IS_DOWN);
4812 4817                          } else {
4813 4818                                  zone_status_set(zone, ZONE_IS_EMPTY);
4814 4819                          }
4815 4820                  }
4816 4821          }
4817 4822          mutex_exit(&zone_status_lock);
4818 4823          mutex_exit(&zonehash_lock);
4819 4824          resume_mounts(zone);
4820 4825  
4821 4826          if (error = zone_empty(zone)) {
4822 4827                  zone_rele(zone);
4823 4828                  return (set_errno(error));
4824 4829          }
4825 4830          /*
4826 4831           * After the zone status goes to ZONE_IS_DOWN this zone will no
4827 4832           * longer be notified of changes to the pools configuration, so
4828 4833           * in order to not end up with a stale pool pointer, we point
4829 4834           * ourselves at the default pool and remove all resource
4830 4835           * visibility.  This is especially important as the zone_t may
4831 4836           * languish on the deathrow for a very long time waiting for
4832 4837           * cred's to drain out.
4833 4838           *
4834 4839           * This rebinding of the zone can happen multiple times
4835 4840           * (presumably due to interrupted or parallel systemcalls)
4836 4841           * without any adverse effects.
4837 4842           */
4838 4843          if (pool_lock_intr() != 0) {
4839 4844                  zone_rele(zone);
4840 4845                  return (set_errno(EINTR));
4841 4846          }
4842 4847          if (pool_state == POOL_ENABLED) {
4843 4848                  mutex_enter(&cpu_lock);
4844 4849                  zone_pool_set(zone, pool_default);
4845 4850                  /*
4846 4851                   * The zone no longer needs to be able to see any cpus.
4847 4852                   */
4848 4853                  zone_pset_set(zone, ZONE_PS_INVAL);
4849 4854                  mutex_exit(&cpu_lock);
4850 4855          }
4851 4856          pool_unlock();
4852 4857  
4853 4858          /*
4854 4859           * ZSD shutdown callbacks can be executed multiple times, hence
4855 4860           * it is safe to not be holding any locks across this call.
4856 4861           */
4857 4862          zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4858 4863  
4859 4864          mutex_enter(&zone_status_lock);
4860 4865          if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4861 4866                  zone_status_set(zone, ZONE_IS_DOWN);
4862 4867          mutex_exit(&zone_status_lock);
4863 4868  
4864 4869          /*
4865 4870           * Wait for kernel threads to drain.
4866 4871           */
4867 4872          if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4868 4873                  zone_rele(zone);
4869 4874                  return (set_errno(EINTR));
4870 4875          }
4871 4876  
4872 4877          /*
4873 4878           * Zone can be become down/destroyable even if the above wait
4874 4879           * returns EINTR, so any code added here may never execute.
4875 4880           * (i.e. don't add code here)
4876 4881           */
4877 4882  
4878 4883          zone_rele(zone);
4879 4884          return (0);
4880 4885  }
4881 4886  
4882 4887  /*
4883 4888   * Log the specified zone's reference counts.  The caller should not be
4884 4889   * holding the zone's zone_lock.
4885 4890   */
4886 4891  static void
4887 4892  zone_log_refcounts(zone_t *zone)
4888 4893  {
4889 4894          char *buffer;
4890 4895          char *buffer_position;
4891 4896          uint32_t buffer_size;
4892 4897          uint32_t index;
4893 4898          uint_t ref;
4894 4899          uint_t cred_ref;
4895 4900  
4896 4901          /*
4897 4902           * Construct a string representing the subsystem-specific reference
4898 4903           * counts.  The counts are printed in ascending order by index into the
4899 4904           * zone_t::zone_subsys_ref array.  The list will be surrounded by
4900 4905           * square brackets [] and will only contain nonzero reference counts.
4901 4906           *
4902 4907           * The buffer will hold two square bracket characters plus ten digits,
4903 4908           * one colon, one space, one comma, and some characters for a
4904 4909           * subsystem name per subsystem-specific reference count.  (Unsigned 32-
4905 4910           * bit integers have at most ten decimal digits.)  The last
4906 4911           * reference count's comma is replaced by the closing square
4907 4912           * bracket and a NULL character to terminate the string.
4908 4913           *
4909 4914           * NOTE: We have to grab the zone's zone_lock to create a consistent
4910 4915           * snapshot of the zone's reference counters.
4911 4916           *
4912 4917           * First, figure out how much space the string buffer will need.
4913 4918           * The buffer's size is stored in buffer_size.
4914 4919           */
4915 4920          buffer_size = 2;                        /* for the square brackets */
4916 4921          mutex_enter(&zone->zone_lock);
4917 4922          zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
4918 4923          ref = zone->zone_ref;
4919 4924          cred_ref = zone->zone_cred_ref;
4920 4925          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
4921 4926                  if (zone->zone_subsys_ref[index] != 0)
4922 4927                          buffer_size += strlen(zone_ref_subsys_names[index]) +
4923 4928                              13;
4924 4929          if (buffer_size == 2) {
4925 4930                  /*
4926 4931                   * No subsystems had nonzero reference counts.  Don't bother
4927 4932                   * with allocating a buffer; just log the general-purpose and
4928 4933                   * credential reference counts.
4929 4934                   */
4930 4935                  mutex_exit(&zone->zone_lock);
4931 4936                  (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4932 4937                      "Zone '%s' (ID: %d) is shutting down, but %u zone "
4933 4938                      "references and %u credential references are still extant",
4934 4939                      zone->zone_name, zone->zone_id, ref, cred_ref);
4935 4940                  return;
4936 4941          }
4937 4942  
4938 4943          /*
4939 4944           * buffer_size contains the exact number of characters that the
4940 4945           * buffer will need.  Allocate the buffer and fill it with nonzero
4941 4946           * subsystem-specific reference counts.  Surround the results with
4942 4947           * square brackets afterwards.
4943 4948           */
4944 4949          buffer = kmem_alloc(buffer_size, KM_SLEEP);
4945 4950          buffer_position = &buffer[1];
4946 4951          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
4947 4952                  /*
4948 4953                   * NOTE: The DDI's version of sprintf() returns a pointer to
4949 4954                   * the modified buffer rather than the number of bytes written
4950 4955                   * (as in snprintf(3C)).  This is unfortunate and annoying.
4951 4956                   * Therefore, we'll use snprintf() with INT_MAX to get the
4952 4957                   * number of bytes written.  Using INT_MAX is safe because
4953 4958                   * the buffer is perfectly sized for the data: we'll never
4954 4959                   * overrun the buffer.
4955 4960                   */
4956 4961                  if (zone->zone_subsys_ref[index] != 0)
4957 4962                          buffer_position += snprintf(buffer_position, INT_MAX,
4958 4963                              "%s: %u,", zone_ref_subsys_names[index],
4959 4964                              zone->zone_subsys_ref[index]);
4960 4965          }
4961 4966          mutex_exit(&zone->zone_lock);
4962 4967          buffer[0] = '[';
4963 4968          ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
4964 4969          ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
4965 4970          buffer_position[-1] = ']';
4966 4971  
4967 4972          /*
4968 4973           * Log the reference counts and free the message buffer.
4969 4974           */
4970 4975          (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4971 4976              "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
4972 4977              "%u credential references are still extant %s", zone->zone_name,
4973 4978              zone->zone_id, ref, cred_ref, buffer);
4974 4979          kmem_free(buffer, buffer_size);
4975 4980  }
4976 4981  
4977 4982  /*
4978 4983   * Systemcall entry point to finalize the zone halt process.  The caller
4979 4984   * must have already successfully called zone_shutdown().
4980 4985   *
4981 4986   * Upon successful completion, the zone will have been fully destroyed:
4982 4987   * zsched will have exited, destructor callbacks executed, and the zone
4983 4988   * removed from the list of active zones.
4984 4989   */
4985 4990  static int
4986 4991  zone_destroy(zoneid_t zoneid)
4987 4992  {
4988 4993          uint64_t uniqid;
4989 4994          zone_t *zone;
4990 4995          zone_status_t status;
4991 4996          clock_t wait_time;
4992 4997          boolean_t log_refcounts;
4993 4998  
4994 4999          if (secpolicy_zone_config(CRED()) != 0)
4995 5000                  return (set_errno(EPERM));
4996 5001          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4997 5002                  return (set_errno(EINVAL));
4998 5003  
4999 5004          mutex_enter(&zonehash_lock);
5000 5005          /*
5001 5006           * Look for zone under hash lock to prevent races with other
5002 5007           * calls to zone_destroy.
5003 5008           */
5004 5009          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5005 5010                  mutex_exit(&zonehash_lock);
5006 5011                  return (set_errno(EINVAL));
5007 5012          }
5008 5013  
5009 5014          if (zone_mount_count(zone->zone_rootpath) != 0) {
5010 5015                  mutex_exit(&zonehash_lock);
5011 5016                  return (set_errno(EBUSY));
5012 5017          }
5013 5018          mutex_enter(&zone_status_lock);
5014 5019          status = zone_status_get(zone);
5015 5020          if (status < ZONE_IS_DOWN) {
5016 5021                  mutex_exit(&zone_status_lock);
5017 5022                  mutex_exit(&zonehash_lock);
5018 5023                  return (set_errno(EBUSY));
5019 5024          } else if (status == ZONE_IS_DOWN) {
5020 5025                  zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5021 5026          }
5022 5027          mutex_exit(&zone_status_lock);
5023 5028          zone_hold(zone);
5024 5029          mutex_exit(&zonehash_lock);
5025 5030  
5026 5031          /*
5027 5032           * wait for zsched to exit
5028 5033           */
5029 5034          zone_status_wait(zone, ZONE_IS_DEAD);
5030 5035          zone_zsd_callbacks(zone, ZSD_DESTROY);
5031 5036          zone->zone_netstack = NULL;
5032 5037          uniqid = zone->zone_uniqid;
5033 5038          zone_rele(zone);
5034 5039          zone = NULL;    /* potentially free'd */
5035 5040  
5036 5041          log_refcounts = B_FALSE;
5037 5042          wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5038 5043          mutex_enter(&zonehash_lock);
5039 5044          for (; /* ever */; ) {
5040 5045                  boolean_t unref;
5041 5046                  boolean_t refs_have_been_logged;
5042 5047  
5043 5048                  if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5044 5049                      zone->zone_uniqid != uniqid) {
5045 5050                          /*
5046 5051                           * The zone has gone away.  Necessary conditions
5047 5052                           * are met, so we return success.
5048 5053                           */
5049 5054                          mutex_exit(&zonehash_lock);
5050 5055                          return (0);
5051 5056                  }
5052 5057                  mutex_enter(&zone->zone_lock);
5053 5058                  unref = ZONE_IS_UNREF(zone);
5054 5059                  refs_have_been_logged = (zone->zone_flags &
5055 5060                      ZF_REFCOUNTS_LOGGED);
5056 5061                  mutex_exit(&zone->zone_lock);
5057 5062                  if (unref) {
5058 5063                          /*
5059 5064                           * There is only one reference to the zone -- that
5060 5065                           * added when the zone was added to the hashtables --
5061 5066                           * and things will remain this way until we drop
5062 5067                           * zonehash_lock... we can go ahead and cleanup the
5063 5068                           * zone.
5064 5069                           */
5065 5070                          break;
5066 5071                  }
5067 5072  
5068 5073                  /*
5069 5074                   * Wait for zone_rele_common() or zone_cred_rele() to signal
5070 5075                   * zone_destroy_cv.  zone_destroy_cv is signaled only when
5071 5076                   * some zone's general-purpose reference count reaches one.
5072 5077                   * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5073 5078                   * on zone_destroy_cv, then log the zone's reference counts and
5074 5079                   * continue to wait for zone_rele() and zone_cred_rele().
5075 5080                   */
5076 5081                  if (!refs_have_been_logged) {
5077 5082                          if (!log_refcounts) {
5078 5083                                  /*
5079 5084                                   * This thread hasn't timed out waiting on
5080 5085                                   * zone_destroy_cv yet.  Wait wait_time clock
5081 5086                                   * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5082 5087                                   * seconds) for the zone's references to clear.
5083 5088                                   */
5084 5089                                  ASSERT(wait_time > 0);
5085 5090                                  wait_time = cv_reltimedwait_sig(
5086 5091                                      &zone_destroy_cv, &zonehash_lock, wait_time,
5087 5092                                      TR_SEC);
5088 5093                                  if (wait_time > 0) {
5089 5094                                          /*
5090 5095                                           * A thread in zone_rele() or
5091 5096                                           * zone_cred_rele() signaled
5092 5097                                           * zone_destroy_cv before this thread's
5093 5098                                           * wait timed out.  The zone might have
5094 5099                                           * only one reference left; find out!
5095 5100                                           */
5096 5101                                          continue;
5097 5102                                  } else if (wait_time == 0) {
5098 5103                                          /* The thread's process was signaled. */
5099 5104                                          mutex_exit(&zonehash_lock);
5100 5105                                          return (set_errno(EINTR));
5101 5106                                  }
5102 5107  
5103 5108                                  /*
5104 5109                                   * The thread timed out while waiting on
5105 5110                                   * zone_destroy_cv.  Even though the thread
5106 5111                                   * timed out, it has to check whether another
5107 5112                                   * thread woke up from zone_destroy_cv and
5108 5113                                   * destroyed the zone.
5109 5114                                   *
5110 5115                                   * If the zone still exists and has more than
5111 5116                                   * one unreleased general-purpose reference,
5112 5117                                   * then log the zone's reference counts.
5113 5118                                   */
5114 5119                                  log_refcounts = B_TRUE;
5115 5120                                  continue;
5116 5121                          }
5117 5122  
5118 5123                          /*
5119 5124                           * The thread already timed out on zone_destroy_cv while
5120 5125                           * waiting for subsystems to release the zone's last
5121 5126                           * general-purpose references.  Log the zone's reference
5122 5127                           * counts and wait indefinitely on zone_destroy_cv.
5123 5128                           */
5124 5129                          zone_log_refcounts(zone);
5125 5130                  }
5126 5131                  if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5127 5132                          /* The thread's process was signaled. */
5128 5133                          mutex_exit(&zonehash_lock);
5129 5134                          return (set_errno(EINTR));
5130 5135                  }
5131 5136          }
5132 5137  
5133 5138          /*
5134 5139           * Remove CPU cap for this zone now since we're not going to
5135 5140           * fail below this point.
5136 5141           */
5137 5142          cpucaps_zone_remove(zone);
5138 5143  
5139 5144          /* Get rid of the zone's kstats */
5140 5145          zone_kstat_delete(zone);
5141 5146  
5142 5147          /* remove the pfexecd doors */
5143 5148          if (zone->zone_pfexecd != NULL) {
5144 5149                  klpd_freelist(&zone->zone_pfexecd);
5145 5150                  zone->zone_pfexecd = NULL;
5146 5151          }
5147 5152  
5148 5153          /* free brand specific data */
5149 5154          if (ZONE_IS_BRANDED(zone))
5150 5155                  ZBROP(zone)->b_free_brand_data(zone);
5151 5156  
5152 5157          /* Say goodbye to brand framework. */
5153 5158          brand_unregister_zone(zone->zone_brand);
5154 5159  
5155 5160          /*
5156 5161           * It is now safe to let the zone be recreated; remove it from the
5157 5162           * lists.  The memory will not be freed until the last cred
5158 5163           * reference goes away.
5159 5164           */
5160 5165          ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5161 5166          zonecount--;
5162 5167          /* remove from active list and hash tables */
5163 5168          list_remove(&zone_active, zone);
5164 5169          (void) mod_hash_destroy(zonehashbyname,
5165 5170              (mod_hash_key_t)zone->zone_name);
5166 5171          (void) mod_hash_destroy(zonehashbyid,
5167 5172              (mod_hash_key_t)(uintptr_t)zone->zone_id);
5168 5173          if (zone->zone_flags & ZF_HASHED_LABEL)
5169 5174                  (void) mod_hash_destroy(zonehashbylabel,
5170 5175                      (mod_hash_key_t)zone->zone_slabel);
5171 5176          mutex_exit(&zonehash_lock);
5172 5177  
5173 5178          /*
5174 5179           * Release the root vnode; we're not using it anymore.  Nor should any
5175 5180           * other thread that might access it exist.
5176 5181           */
5177 5182          if (zone->zone_rootvp != NULL) {
5178 5183                  VN_RELE(zone->zone_rootvp);
5179 5184                  zone->zone_rootvp = NULL;
5180 5185          }
5181 5186  
5182 5187          /* add to deathrow list */
5183 5188          mutex_enter(&zone_deathrow_lock);
5184 5189          list_insert_tail(&zone_deathrow, zone);
5185 5190          mutex_exit(&zone_deathrow_lock);
5186 5191  
5187 5192          /*
5188 5193           * Drop last reference (which was added by zsched()), this will
5189 5194           * free the zone unless there are outstanding cred references.
5190 5195           */
5191 5196          zone_rele(zone);
5192 5197          return (0);
5193 5198  }
5194 5199  
5195 5200  /*
5196 5201   * Systemcall entry point for zone_getattr(2).
5197 5202   */
5198 5203  static ssize_t
5199 5204  zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5200 5205  {
5201 5206          size_t size;
5202 5207          int error = 0, err;
5203 5208          zone_t *zone;
5204 5209          char *zonepath;
5205 5210          char *outstr;
5206 5211          zone_status_t zone_status;
5207 5212          pid_t initpid;
5208 5213          boolean_t global = (curzone == global_zone);
5209 5214          boolean_t inzone = (curzone->zone_id == zoneid);
5210 5215          ushort_t flags;
5211 5216          zone_net_data_t *zbuf;
5212 5217  
5213 5218          mutex_enter(&zonehash_lock);
5214 5219          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5215 5220                  mutex_exit(&zonehash_lock);
5216 5221                  return (set_errno(EINVAL));
5217 5222          }
5218 5223          zone_status = zone_status_get(zone);
5219 5224          if (zone_status < ZONE_IS_INITIALIZED) {
5220 5225                  mutex_exit(&zonehash_lock);
5221 5226                  return (set_errno(EINVAL));
5222 5227          }
5223 5228          zone_hold(zone);
5224 5229          mutex_exit(&zonehash_lock);
5225 5230  
5226 5231          /*
5227 5232           * If not in the global zone, don't show information about other zones,
5228 5233           * unless the system is labeled and the local zone's label dominates
5229 5234           * the other zone.
5230 5235           */
5231 5236          if (!zone_list_access(zone)) {
5232 5237                  zone_rele(zone);
5233 5238                  return (set_errno(EINVAL));
5234 5239          }
5235 5240  
5236 5241          switch (attr) {
5237 5242          case ZONE_ATTR_ROOT:
5238 5243                  if (global) {
5239 5244                          /*
5240 5245                           * Copy the path to trim the trailing "/" (except for
5241 5246                           * the global zone).
5242 5247                           */
5243 5248                          if (zone != global_zone)
5244 5249                                  size = zone->zone_rootpathlen - 1;
5245 5250                          else
5246 5251                                  size = zone->zone_rootpathlen;
5247 5252                          zonepath = kmem_alloc(size, KM_SLEEP);
5248 5253                          bcopy(zone->zone_rootpath, zonepath, size);
5249 5254                          zonepath[size - 1] = '\0';
5250 5255                  } else {
5251 5256                          if (inzone || !is_system_labeled()) {
5252 5257                                  /*
5253 5258                                   * Caller is not in the global zone.
5254 5259                                   * if the query is on the current zone
5255 5260                                   * or the system is not labeled,
5256 5261                                   * just return faked-up path for current zone.
5257 5262                                   */
5258 5263                                  zonepath = "/";
5259 5264                                  size = 2;
5260 5265                          } else {
5261 5266                                  /*
5262 5267                                   * Return related path for current zone.
5263 5268                                   */
5264 5269                                  int prefix_len = strlen(zone_prefix);
5265 5270                                  int zname_len = strlen(zone->zone_name);
5266 5271  
5267 5272                                  size = prefix_len + zname_len + 1;
5268 5273                                  zonepath = kmem_alloc(size, KM_SLEEP);
5269 5274                                  bcopy(zone_prefix, zonepath, prefix_len);
5270 5275                                  bcopy(zone->zone_name, zonepath +
5271 5276                                      prefix_len, zname_len);
5272 5277                                  zonepath[size - 1] = '\0';
5273 5278                          }
5274 5279                  }
5275 5280                  if (bufsize > size)
5276 5281                          bufsize = size;
5277 5282                  if (buf != NULL) {
5278 5283                          err = copyoutstr(zonepath, buf, bufsize, NULL);
5279 5284                          if (err != 0 && err != ENAMETOOLONG)
5280 5285                                  error = EFAULT;
5281 5286                  }
5282 5287                  if (global || (is_system_labeled() && !inzone))
5283 5288                          kmem_free(zonepath, size);
5284 5289                  break;
5285 5290  
5286 5291          case ZONE_ATTR_NAME:
5287 5292                  size = strlen(zone->zone_name) + 1;
5288 5293                  if (bufsize > size)
5289 5294                          bufsize = size;
5290 5295                  if (buf != NULL) {
5291 5296                          err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5292 5297                          if (err != 0 && err != ENAMETOOLONG)
5293 5298                                  error = EFAULT;
5294 5299                  }
5295 5300                  break;
5296 5301  
5297 5302          case ZONE_ATTR_STATUS:
5298 5303                  /*
5299 5304                   * Since we're not holding zonehash_lock, the zone status
5300 5305                   * may be anything; leave it up to userland to sort it out.
5301 5306                   */
5302 5307                  size = sizeof (zone_status);
5303 5308                  if (bufsize > size)
5304 5309                          bufsize = size;
5305 5310                  zone_status = zone_status_get(zone);
5306 5311                  if (buf != NULL &&
5307 5312                      copyout(&zone_status, buf, bufsize) != 0)
5308 5313                          error = EFAULT;
5309 5314                  break;
5310 5315          case ZONE_ATTR_FLAGS:
5311 5316                  size = sizeof (zone->zone_flags);
5312 5317                  if (bufsize > size)
5313 5318                          bufsize = size;
5314 5319                  flags = zone->zone_flags;
5315 5320                  if (buf != NULL &&
5316 5321                      copyout(&flags, buf, bufsize) != 0)
5317 5322                          error = EFAULT;
5318 5323                  break;
5319 5324          case ZONE_ATTR_PRIVSET:
5320 5325                  size = sizeof (priv_set_t);
5321 5326                  if (bufsize > size)
5322 5327                          bufsize = size;
5323 5328                  if (buf != NULL &&
5324 5329                      copyout(zone->zone_privset, buf, bufsize) != 0)
5325 5330                          error = EFAULT;
5326 5331                  break;
5327 5332          case ZONE_ATTR_UNIQID:
5328 5333                  size = sizeof (zone->zone_uniqid);
5329 5334                  if (bufsize > size)
5330 5335                          bufsize = size;
5331 5336                  if (buf != NULL &&
5332 5337                      copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5333 5338                          error = EFAULT;
5334 5339                  break;
5335 5340          case ZONE_ATTR_POOLID:
5336 5341                  {
5337 5342                          pool_t *pool;
5338 5343                          poolid_t poolid;
5339 5344  
5340 5345                          if (pool_lock_intr() != 0) {
5341 5346                                  error = EINTR;
5342 5347                                  break;
5343 5348                          }
5344 5349                          pool = zone_pool_get(zone);
5345 5350                          poolid = pool->pool_id;
5346 5351                          pool_unlock();
5347 5352                          size = sizeof (poolid);
5348 5353                          if (bufsize > size)
5349 5354                                  bufsize = size;
5350 5355                          if (buf != NULL && copyout(&poolid, buf, size) != 0)
5351 5356                                  error = EFAULT;
5352 5357                  }
5353 5358                  break;
5354 5359          case ZONE_ATTR_SLBL:
5355 5360                  size = sizeof (bslabel_t);
5356 5361                  if (bufsize > size)
5357 5362                          bufsize = size;
5358 5363                  if (zone->zone_slabel == NULL)
5359 5364                          error = EINVAL;
5360 5365                  else if (buf != NULL &&
5361 5366                      copyout(label2bslabel(zone->zone_slabel), buf,
5362 5367                      bufsize) != 0)
5363 5368                          error = EFAULT;
5364 5369                  break;
5365 5370          case ZONE_ATTR_INITPID:
5366 5371                  size = sizeof (initpid);
5367 5372                  if (bufsize > size)
5368 5373                          bufsize = size;
5369 5374                  initpid = zone->zone_proc_initpid;
5370 5375                  if (initpid == -1) {
5371 5376                          error = ESRCH;
5372 5377                          break;
5373 5378                  }
5374 5379                  if (buf != NULL &&
5375 5380                      copyout(&initpid, buf, bufsize) != 0)
5376 5381                          error = EFAULT;
5377 5382                  break;
5378 5383          case ZONE_ATTR_BRAND:
5379 5384                  size = strlen(zone->zone_brand->b_name) + 1;
5380 5385  
5381 5386                  if (bufsize > size)
5382 5387                          bufsize = size;
5383 5388                  if (buf != NULL) {
5384 5389                          err = copyoutstr(zone->zone_brand->b_name, buf,
5385 5390                              bufsize, NULL);
5386 5391                          if (err != 0 && err != ENAMETOOLONG)
5387 5392                                  error = EFAULT;
5388 5393                  }
5389 5394                  break;
5390 5395          case ZONE_ATTR_INITNAME:
5391 5396                  size = strlen(zone->zone_initname) + 1;
5392 5397                  if (bufsize > size)
5393 5398                          bufsize = size;
5394 5399                  if (buf != NULL) {
5395 5400                          err = copyoutstr(zone->zone_initname, buf, bufsize,
5396 5401                              NULL);
5397 5402                          if (err != 0 && err != ENAMETOOLONG)
5398 5403                                  error = EFAULT;
5399 5404                  }
5400 5405                  break;
5401 5406          case ZONE_ATTR_BOOTARGS:
5402 5407                  if (zone->zone_bootargs == NULL)
5403 5408                          outstr = "";
5404 5409                  else
5405 5410                          outstr = zone->zone_bootargs;
5406 5411                  size = strlen(outstr) + 1;
5407 5412                  if (bufsize > size)
5408 5413                          bufsize = size;
5409 5414                  if (buf != NULL) {
5410 5415                          err = copyoutstr(outstr, buf, bufsize, NULL);
5411 5416                          if (err != 0 && err != ENAMETOOLONG)
5412 5417                                  error = EFAULT;
5413 5418                  }
5414 5419                  break;
5415 5420          case ZONE_ATTR_PHYS_MCAP:
5416 5421                  size = sizeof (zone->zone_phys_mcap);
5417 5422                  if (bufsize > size)
5418 5423                          bufsize = size;
5419 5424                  if (buf != NULL &&
5420 5425                      copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5421 5426                          error = EFAULT;
5422 5427                  break;
5423 5428          case ZONE_ATTR_SCHED_CLASS:
5424 5429                  mutex_enter(&class_lock);
5425 5430  
5426 5431                  if (zone->zone_defaultcid >= loaded_classes)
5427 5432                          outstr = "";
5428 5433                  else
5429 5434                          outstr = sclass[zone->zone_defaultcid].cl_name;
5430 5435                  size = strlen(outstr) + 1;
5431 5436                  if (bufsize > size)
5432 5437                          bufsize = size;
5433 5438                  if (buf != NULL) {
5434 5439                          err = copyoutstr(outstr, buf, bufsize, NULL);
5435 5440                          if (err != 0 && err != ENAMETOOLONG)
5436 5441                                  error = EFAULT;
5437 5442                  }
5438 5443  
5439 5444                  mutex_exit(&class_lock);
5440 5445                  break;
5441 5446          case ZONE_ATTR_HOSTID:
5442 5447                  if (zone->zone_hostid != HW_INVALID_HOSTID &&
5443 5448                      bufsize == sizeof (zone->zone_hostid)) {
5444 5449                          size = sizeof (zone->zone_hostid);
5445 5450                          if (buf != NULL && copyout(&zone->zone_hostid, buf,
5446 5451                              bufsize) != 0)
5447 5452                                  error = EFAULT;
5448 5453                  } else {
5449 5454                          error = EINVAL;
5450 5455                  }
5451 5456                  break;
5452 5457          case ZONE_ATTR_FS_ALLOWED:
5453 5458                  if (zone->zone_fs_allowed == NULL)
5454 5459                          outstr = "";
5455 5460                  else
5456 5461                          outstr = zone->zone_fs_allowed;
5457 5462                  size = strlen(outstr) + 1;
5458 5463                  if (bufsize > size)
5459 5464                          bufsize = size;
5460 5465                  if (buf != NULL) {
5461 5466                          err = copyoutstr(outstr, buf, bufsize, NULL);
5462 5467                          if (err != 0 && err != ENAMETOOLONG)
5463 5468                                  error = EFAULT;
5464 5469                  }
5465 5470                  break;
5466 5471          case ZONE_ATTR_NETWORK:
5467 5472                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5468 5473                  if (copyin(buf, zbuf, bufsize) != 0) {
5469 5474                          error = EFAULT;
5470 5475                  } else {
5471 5476                          error = zone_get_network(zoneid, zbuf);
5472 5477                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5473 5478                                  error = EFAULT;
5474 5479                  }
5475 5480                  kmem_free(zbuf, bufsize);
5476 5481                  break;
5477 5482          default:
5478 5483                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5479 5484                          size = bufsize;
5480 5485                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5481 5486                  } else {
5482 5487                          error = EINVAL;
5483 5488                  }
5484 5489          }
5485 5490          zone_rele(zone);
5486 5491  
5487 5492          if (error)
5488 5493                  return (set_errno(error));
5489 5494          return ((ssize_t)size);
5490 5495  }
5491 5496  
5492 5497  /*
5493 5498   * Systemcall entry point for zone_setattr(2).
5494 5499   */
5495 5500  /*ARGSUSED*/
5496 5501  static int
5497 5502  zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5498 5503  {
5499 5504          zone_t *zone;
5500 5505          zone_status_t zone_status;
5501 5506          int err = -1;
5502 5507          zone_net_data_t *zbuf;
5503 5508  
5504 5509          if (secpolicy_zone_config(CRED()) != 0)
5505 5510                  return (set_errno(EPERM));
5506 5511  
5507 5512          /*
5508 5513           * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5509 5514           * global zone.
5510 5515           */
5511 5516          if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5512 5517                  return (set_errno(EINVAL));
5513 5518          }
5514 5519  
5515 5520          mutex_enter(&zonehash_lock);
5516 5521          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5517 5522                  mutex_exit(&zonehash_lock);
5518 5523                  return (set_errno(EINVAL));
5519 5524          }
5520 5525          zone_hold(zone);
5521 5526          mutex_exit(&zonehash_lock);
5522 5527  
5523 5528          /*
5524 5529           * At present most attributes can only be set on non-running,
5525 5530           * non-global zones.
5526 5531           */
5527 5532          zone_status = zone_status_get(zone);
5528 5533          if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5529 5534                  err = EINVAL;
5530 5535                  goto done;
5531 5536          }
5532 5537  
5533 5538          switch (attr) {
5534 5539          case ZONE_ATTR_INITNAME:
5535 5540                  err = zone_set_initname(zone, (const char *)buf);
5536 5541                  break;
5537 5542          case ZONE_ATTR_INITNORESTART:
5538 5543                  zone->zone_restart_init = B_FALSE;
5539 5544                  err = 0;
5540 5545                  break;
5541 5546          case ZONE_ATTR_BOOTARGS:
5542 5547                  err = zone_set_bootargs(zone, (const char *)buf);
5543 5548                  break;
5544 5549          case ZONE_ATTR_BRAND:
5545 5550                  err = zone_set_brand(zone, (const char *)buf);
5546 5551                  break;
5547 5552          case ZONE_ATTR_FS_ALLOWED:
5548 5553                  err = zone_set_fs_allowed(zone, (const char *)buf);
5549 5554                  break;
5550 5555          case ZONE_ATTR_PHYS_MCAP:
5551 5556                  err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5552 5557                  break;
5553 5558          case ZONE_ATTR_SCHED_CLASS:
5554 5559                  err = zone_set_sched_class(zone, (const char *)buf);
5555 5560                  break;
5556 5561          case ZONE_ATTR_HOSTID:
5557 5562                  if (bufsize == sizeof (zone->zone_hostid)) {
5558 5563                          if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5559 5564                                  err = 0;
5560 5565                          else
5561 5566                                  err = EFAULT;
5562 5567                  } else {
5563 5568                          err = EINVAL;
5564 5569                  }
5565 5570                  break;
5566 5571          case ZONE_ATTR_NETWORK:
5567 5572                  if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5568 5573                          err = EINVAL;
5569 5574                          break;
5570 5575                  }
5571 5576                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5572 5577                  if (copyin(buf, zbuf, bufsize) != 0) {
5573 5578                          kmem_free(zbuf, bufsize);
5574 5579                          err = EFAULT;
5575 5580                          break;
5576 5581                  }
5577 5582                  err = zone_set_network(zoneid, zbuf);
5578 5583                  kmem_free(zbuf, bufsize);
5579 5584                  break;
5580 5585          default:
5581 5586                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5582 5587                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5583 5588                  else
5584 5589                          err = EINVAL;
5585 5590          }
5586 5591  
5587 5592  done:
5588 5593          zone_rele(zone);
5589 5594          ASSERT(err != -1);
5590 5595          return (err != 0 ? set_errno(err) : 0);
5591 5596  }
5592 5597  
5593 5598  /*
5594 5599   * Return zero if the process has at least one vnode mapped in to its
5595 5600   * address space which shouldn't be allowed to change zones.
5596 5601   *
5597 5602   * Also return zero if the process has any shared mappings which reserve
5598 5603   * swap.  This is because the counting for zone.max-swap does not allow swap
5599 5604   * reservation to be shared between zones.  zone swap reservation is counted
5600 5605   * on zone->zone_max_swap.
5601 5606   */
5602 5607  static int
5603 5608  as_can_change_zones(void)
5604 5609  {
5605 5610          proc_t *pp = curproc;
5606 5611          struct seg *seg;
5607 5612          struct as *as = pp->p_as;
5608 5613          vnode_t *vp;
5609 5614          int allow = 1;
5610 5615  
5611 5616          ASSERT(pp->p_as != &kas);
5612 5617          AS_LOCK_ENTER(as, RW_READER);
5613 5618          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5614 5619  
5615 5620                  /*
5616 5621                   * Cannot enter zone with shared anon memory which
5617 5622                   * reserves swap.  See comment above.
5618 5623                   */
5619 5624                  if (seg_can_change_zones(seg) == B_FALSE) {
5620 5625                          allow = 0;
5621 5626                          break;
5622 5627                  }
5623 5628                  /*
5624 5629                   * if we can't get a backing vnode for this segment then skip
5625 5630                   * it.
5626 5631                   */
5627 5632                  vp = NULL;
5628 5633                  if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5629 5634                          continue;
5630 5635                  if (!vn_can_change_zones(vp)) { /* bail on first match */
5631 5636                          allow = 0;
5632 5637                          break;
5633 5638                  }
5634 5639          }
5635 5640          AS_LOCK_EXIT(as);
5636 5641          return (allow);
5637 5642  }
5638 5643  
5639 5644  /*
5640 5645   * Count swap reserved by curproc's address space
5641 5646   */
5642 5647  static size_t
5643 5648  as_swresv(void)
5644 5649  {
5645 5650          proc_t *pp = curproc;
5646 5651          struct seg *seg;
5647 5652          struct as *as = pp->p_as;
5648 5653          size_t swap = 0;
5649 5654  
5650 5655          ASSERT(pp->p_as != &kas);
5651 5656          ASSERT(AS_WRITE_HELD(as));
5652 5657          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5653 5658                  swap += seg_swresv(seg);
5654 5659  
5655 5660          return (swap);
5656 5661  }
5657 5662  
5658 5663  /*
5659 5664   * Systemcall entry point for zone_enter().
5660 5665   *
5661 5666   * The current process is injected into said zone.  In the process
5662 5667   * it will change its project membership, privileges, rootdir/cwd,
5663 5668   * zone-wide rctls, and pool association to match those of the zone.
5664 5669   *
5665 5670   * The first zone_enter() called while the zone is in the ZONE_IS_READY
5666 5671   * state will transition it to ZONE_IS_RUNNING.  Processes may only
5667 5672   * enter a zone that is "ready" or "running".
            *
            * Returns 0 on success.  On failure the error is reported through
            * set_errno(): EPERM, EINVAL, EINTR, EBADF, EFAULT, or whatever
            * error pool_do_bind() returned.
5668 5673   */
5669 5674  static int
5670 5675  zone_enter(zoneid_t zoneid)
5671 5676  {
5672 5677          zone_t *zone;
5673 5678          vnode_t *vp;
5674 5679          proc_t *pp = curproc;
5675 5680          contract_t *ct;
5676 5681          cont_process_t *ctp;
5677 5682          task_t *tk, *oldtk;
5678 5683          kproject_t *zone_proj0;
5679 5684          cred_t *cr, *newcr;
5680 5685          pool_t *oldpool, *newpool;
5681 5686          sess_t *sp;
5682 5687          uid_t uid;
5683 5688          zone_status_t status;
5684 5689          int err = 0;
5685 5690          rctl_entity_p_t e;
5686 5691          size_t swap;
5687 5692          kthread_id_t t;
5688 5693  
5689 5694          if (secpolicy_zone_config(CRED()) != 0)
5690 5695                  return (set_errno(EPERM));
5691 5696          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5692 5697                  return (set_errno(EINVAL));
5693 5698  
5694 5699          /*
5695 5700           * Stop all lwps so we don't need to hold a lock to look at
5696 5701           * curproc->p_zone.  This needs to happen before we grab any
5697 5702           * locks to avoid deadlock (another lwp in the process could
5698 5703           * be waiting for the held lock).
5699 5704           */
5700 5705          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5701 5706                  return (set_errno(EINTR));
5702 5707  
                   /*
                    * From here on every failure leaves through "out" so that the
                    * lwps stopped above are allowed to continue.
                    */

5703 5708          /*
5704 5709           * Make sure we're not changing zones with files open or mapped in
5705 5710           * to our address space which shouldn't be changing zones.
5706 5711           */
5707 5712          if (!files_can_change_zones()) {
5708 5713                  err = EBADF;
5709 5714                  goto out;
5710 5715          }
5711 5716          if (!as_can_change_zones()) {
5712 5717                  err = EFAULT;
5713 5718                  goto out;
5714 5719          }
5715 5720  
5716 5721          mutex_enter(&zonehash_lock);
                   /* Only a process currently in the global zone may enter a zone. */
5717 5722          if (pp->p_zone != global_zone) {
5718 5723                  mutex_exit(&zonehash_lock);
5719 5724                  err = EINVAL;
5720 5725                  goto out;
5721 5726          }
5722 5727  
5723 5728          zone = zone_find_all_by_id(zoneid);
5724 5729          if (zone == NULL) {
5725 5730                  mutex_exit(&zonehash_lock);
5726 5731                  err = EINVAL;
5727 5732                  goto out;
5728 5733          }
5729 5734  
5730 5735          /*
5731 5736           * To prevent processes in a zone from holding contracts on
5732 5737           * extrazonal resources, and to avoid process contract
5733 5738           * memberships which span zones, contract holders and processes
5734 5739           * which aren't the sole members of their encapsulating process
5735 5740           * contracts are not allowed to zone_enter.
5736 5741           */
5737 5742          ctp = pp->p_ct_process;
5738 5743          ct = &ctp->conp_contract;
5739 5744          mutex_enter(&ct->ct_lock);
5740 5745          mutex_enter(&pp->p_lock);
5741 5746          if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5742 5747                  mutex_exit(&pp->p_lock);
5743 5748                  mutex_exit(&ct->ct_lock);
5744 5749                  mutex_exit(&zonehash_lock);
5745 5750                  err = EINVAL;
5746 5751                  goto out;
5747 5752          }
5748 5753  
5749 5754          /*
5750 5755           * Moreover, we don't allow processes whose encapsulating
5751 5756           * process contracts have inherited extrazonal contracts.
5752 5757           * While it would be easier to eliminate all process contracts
5753 5758           * with inherited contracts, we need to be able to give a
5754 5759           * restarted init (or other zone-penetrating process) its
5755 5760           * predecessor's contracts.
5756 5761           */
5757 5762          if (ctp->conp_ninherited != 0) {
5758 5763                  contract_t *next;
5759 5764                  for (next = list_head(&ctp->conp_inherited); next;
5760 5765                      next = list_next(&ctp->conp_inherited, next)) {
5761 5766                          if (contract_getzuniqid(next) != zone->zone_uniqid) {
5762 5767                                  mutex_exit(&pp->p_lock);
5763 5768                                  mutex_exit(&ct->ct_lock);
5764 5769                                  mutex_exit(&zonehash_lock);
5765 5770                                  err = EINVAL;
5766 5771                                  goto out;
5767 5772                          }
5768 5773                  }
5769 5774          }
5770 5775  
5771 5776          mutex_exit(&pp->p_lock);
5772 5777          mutex_exit(&ct->ct_lock);
5773 5778  
5774 5779          status = zone_status_get(zone);
5775 5780          if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5776 5781                  /*
5777 5782                   * Can't join
5778 5783                   */
5779 5784                  mutex_exit(&zonehash_lock);
5780 5785                  err = EINVAL;
5781 5786                  goto out;
5782 5787          }
5783 5788  
5784 5789          /*
5785 5790           * Make sure new priv set is within the permitted set for caller
5786 5791           */
5787 5792          if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5788 5793                  mutex_exit(&zonehash_lock);
5789 5794                  err = EPERM;
5790 5795                  goto out;
5791 5796          }
5792 5797          /*
5793 5798           * We want to momentarily drop zonehash_lock while we optimistically
5794 5799           * bind curproc to the pool it should be running in.  This is safe
5795 5800           * since the zone can't disappear (we have a hold on it).
5796 5801           */
5797 5802          zone_hold(zone);
5798 5803          mutex_exit(&zonehash_lock);
5799 5804  
5800 5805          /*
5801 5806           * Grab pool_lock to keep the pools configuration from changing
5802 5807           * and to stop ourselves from getting rebound to another pool
5803 5808           * until we join the zone.
5804 5809           */
5805 5810          if (pool_lock_intr() != 0) {
5806 5811                  zone_rele(zone);
5807 5812                  err = EINTR;
5808 5813                  goto out;
5809 5814          }
5810 5815          ASSERT(secpolicy_pool(CRED()) == 0);
5811 5816          /*
5812 5817           * Bind ourselves to the pool currently associated with the zone.
5813 5818           */
5814 5819          oldpool = curproc->p_pool;
5815 5820          newpool = zone_pool_get(zone);
5816 5821          if (pool_state == POOL_ENABLED && newpool != oldpool &&
5817 5822              (err = pool_do_bind(newpool, P_PID, P_MYID,
5818 5823              POOL_BIND_ALL)) != 0) {
                           /* err already holds pool_do_bind()'s error for "out". */
5819 5824                  pool_unlock();
5820 5825                  zone_rele(zone);
5821 5826                  goto out;
5822 5827          }
5823 5828  
5824 5829          /*
5825 5830           * Grab cpu_lock now; we'll need it later when we call
5826 5831           * task_join().
5827 5832           */
5828 5833          mutex_enter(&cpu_lock);
5829 5834          mutex_enter(&zonehash_lock);
5830 5835          /*
5831 5836           * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5832 5837           */
5833 5838          if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5834 5839                  /*
5835 5840                   * Can't join anymore.
5836 5841                   */
5837 5842                  mutex_exit(&zonehash_lock);
5838 5843                  mutex_exit(&cpu_lock);
                           /* Undo the optimistic pool binding made above. */
5839 5844                  if (pool_state == POOL_ENABLED &&
5840 5845                      newpool != oldpool)
5841 5846                          (void) pool_do_bind(oldpool, P_PID, P_MYID,
5842 5847                              POOL_BIND_ALL);
5843 5848                  pool_unlock();
5844 5849                  zone_rele(zone);
5845 5850                  err = EINVAL;
5846 5851                  goto out;
5847 5852          }
5848 5853  
5849 5854          /*
5850 5855           * a_lock must be held while transferring locked memory and swap
5851 5856           * reservation from the global zone to the non global zone because
5852 5857           * asynchronous faults on the processes' address space can lock
5853 5858           * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5854 5859           * segments respectively.
5855 5860           */
5856 5861          AS_LOCK_ENTER(pp->p_as, RW_WRITER);
5857 5862          swap = as_swresv();
5858 5863          mutex_enter(&pp->p_lock);
5859 5864          zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5860 5865          /* verify that we do not exceed any task or lwp limits */
                   /*
                    * NOTE(review): no limit check is visible below; the counters
                    * are only adjusted -- confirm whether a check is expected here.
                    */
5861 5866          mutex_enter(&zone->zone_nlwps_lock);
5862 5867          /* add new lwps to zone and zone's proj0 */
5863 5868          zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5864 5869          zone->zone_nlwps += pp->p_lwpcnt;
5865 5870          /* add 1 task to zone's proj0 */
5866 5871          zone_proj0->kpj_ntasks += 1;
5867 5872  
5868 5873          zone_proj0->kpj_nprocs++;
5869 5874          zone->zone_nprocs++;
5870 5875          mutex_exit(&zone->zone_nlwps_lock);
5871 5876  
                   /* credit locked memory and swap reservation to the new zone */
5872 5877          mutex_enter(&zone->zone_mem_lock);
5873 5878          zone->zone_locked_mem += pp->p_locked_mem;
5874 5879          zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5875 5880          zone->zone_max_swap += swap;
5876 5881          mutex_exit(&zone->zone_mem_lock);
5877 5882  
5878 5883          mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5879 5884          zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5880 5885          mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5881 5886  
5882 5887          /* remove lwps and process from proc's old zone and old project */
5883 5888          mutex_enter(&pp->p_zone->zone_nlwps_lock);
5884 5889          pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5885 5890          pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5886 5891          pp->p_task->tk_proj->kpj_nprocs--;
5887 5892          pp->p_zone->zone_nprocs--;
5888 5893          mutex_exit(&pp->p_zone->zone_nlwps_lock);
5889 5894  
5890 5895          mutex_enter(&pp->p_zone->zone_mem_lock);
5891 5896          pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5892 5897          pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5893 5898          pp->p_zone->zone_max_swap -= swap;
5894 5899          mutex_exit(&pp->p_zone->zone_mem_lock);
5895 5900  
5896 5901          mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5897 5902          pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5898 5903          mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5899 5904  
5900 5905          pp->p_flag |= SZONETOP;
5901 5906          pp->p_zone = zone;
5902 5907          mutex_exit(&pp->p_lock);
5903 5908          AS_LOCK_EXIT(pp->p_as);
5904 5909  
5905 5910          /*
5906 5911           * Joining the zone cannot fail from now on.
5907 5912           *
5908 5913           * This means that a lot of the following code can be commonized and
5909 5914           * shared with zsched().
5910 5915           */
5911 5916  
5912 5917          /*
5913 5918           * If the process contract fmri was inherited, we need to
5914 5919           * flag this so that any contract status will not leak
5915 5920           * extra zone information, svc_fmri in this case
5916 5921           */
5917 5922          if (ctp->conp_svc_ctid != ct->ct_id) {
5918 5923                  mutex_enter(&ct->ct_lock);
5919 5924                  ctp->conp_svc_zone_enter = ct->ct_id;
5920 5925                  mutex_exit(&ct->ct_lock);
5921 5926          }
5922 5927  
5923 5928          /*
5924 5929           * Reset the encapsulating process contract's zone.
5925 5930           */
5926 5931          ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5927 5932          contract_setzuniqid(ct, zone->zone_uniqid);
5928 5933  
5929 5934          /*
5930 5935           * Create a new task and associate the process with the project keyed
5931 5936           * by (projid,zoneid).
5932 5937           *
5933 5938           * We might as well be in project 0; the global zone's projid doesn't
5934 5939           * make much sense in a zone anyhow.
5935 5940           *
5936 5941           * This also increments zone_ntasks, and returns with p_lock held.
5937 5942           */
5938 5943          tk = task_create(0, zone);
5939 5944          oldtk = task_join(tk, 0);
5940 5945          mutex_exit(&cpu_lock);
5941 5946  
5942 5947          /*
5943 5948           * call RCTLOP_SET functions on this proc
5944 5949           */
5945 5950          e.rcep_p.zone = zone;
5946 5951          e.rcep_t = RCENTITY_ZONE;
5947 5952          (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5948 5953              RCD_CALLBACK);
5949 5954          mutex_exit(&pp->p_lock);
5950 5955  
5951 5956          /*
5952 5957           * We don't need to hold any of zsched's locks here; not only do we know
5953 5958           * the process and zone aren't going away, we know its session isn't
5954 5959           * changing either.
5955 5960           *
5956 5961           * By joining zsched's session here, we mimic the behavior in the
5957 5962           * global zone of init's sid being the pid of sched.  We extend this
5958 5963           * to all zlogin-like zone_enter()'ing processes as well.
5959 5964           */
5960 5965          mutex_enter(&pidlock);
5961 5966          sp = zone->zone_zsched->p_sessp;
5962 5967          sess_hold(zone->zone_zsched);
5963 5968          mutex_enter(&pp->p_lock);
5964 5969          pgexit(pp);
5965 5970          sess_rele(pp->p_sessp, B_TRUE);
5966 5971          pp->p_sessp = sp;
5967 5972          pgjoin(pp, zone->zone_zsched->p_pidp);
5968 5973  
5969 5974          /*
5970 5975           * If any threads are scheduled to be placed on zone wait queue they
5971 5976           * should abandon the idea since the wait queue is changing.
5972 5977           * We need to be holding pidlock & p_lock to do this.
5973 5978           */
5974 5979          if ((t = pp->p_tlist) != NULL) {
5975 5980                  do {
5976 5981                          thread_lock(t);
5977 5982                          /*
5978 5983                           * Kick this thread so that he doesn't sit
5979 5984                           * on a wrong wait queue.
5980 5985                           */
5981 5986                          if (ISWAITING(t))
5982 5987                                  setrun_locked(t);
5983 5988  
5984 5989                          if (t->t_schedflag & TS_ANYWAITQ)
5985 5990                                  t->t_schedflag &= ~ TS_ANYWAITQ;
5986 5991  
5987 5992                          thread_unlock(t);
5988 5993                  } while ((t = t->t_forw) != pp->p_tlist);
5989 5994          }
5990 5995  
5991 5996          /*
5992 5997           * If there is a default scheduling class for the zone and it is not
5993 5998           * the class we are currently in, change all of the threads in the
5994 5999           * process to the new class.  We need to be holding pidlock & p_lock
5995 6000           * when we call parmsset so this is a good place to do it.
5996 6001           */
5997 6002          if (zone->zone_defaultcid > 0 &&
5998 6003              zone->zone_defaultcid != curthread->t_cid) {
5999 6004                  pcparms_t pcparms;
6000 6005  
6001 6006                  pcparms.pc_cid = zone->zone_defaultcid;
6002 6007                  pcparms.pc_clparms[0] = 0;
6003 6008  
6004 6009                  /*
6005 6010                   * If setting the class fails, we still want to enter the zone.
6006 6011                   */
6007 6012                  if ((t = pp->p_tlist) != NULL) {
6008 6013                          do {
6009 6014                                  (void) parmsset(&pcparms, t);
6010 6015                          } while ((t = t->t_forw) != pp->p_tlist);
6011 6016                  }
6012 6017          }
6013 6018  
6014 6019          mutex_exit(&pp->p_lock);
6015 6020          mutex_exit(&pidlock);
6016 6021  
                   /*
                    * Drop zonehash_lock, which has been held since the zone's status
                    * was re-checked after binding to the pool.
                    */
6017 6022          mutex_exit(&zonehash_lock);
6018 6023          /*
6019 6024           * We're firmly in the zone; let pools progress.
6020 6025           */
6021 6026          pool_unlock();
6022 6027          task_rele(oldtk);
6023 6028          /*
6024 6029           * We don't need to retain a hold on the zone since we already
6025 6030           * incremented zone_ntasks, so the zone isn't going anywhere.
6026 6031           */
6027 6032          zone_rele(zone);
6028 6033  
6029 6034          /*
6030 6035           * Chroot
6031 6036           */
6032 6037          vp = zone->zone_rootvp;
6033 6038          zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6034 6039          zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6035 6040  
6036 6041          /*
6037 6042           * Change process credentials
6038 6043           */
6039 6044          newcr = cralloc();
6040 6045          mutex_enter(&pp->p_crlock);
6041 6046          cr = pp->p_cred;
6042 6047          crcopy_to(cr, newcr);
6043 6048          crsetzone(newcr, zone);
6044 6049          pp->p_cred = newcr;
6045 6050  
6046 6051          /*
6047 6052           * Restrict all process privilege sets to zone limit
6048 6053           */
6049 6054          priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6050 6055          priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6051 6056          priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6052 6057          priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6053 6058          mutex_exit(&pp->p_crlock);
6054 6059          crset(pp, newcr);
6055 6060  
6056 6061          /*
6057 6062           * Adjust upcount to reflect zone entry.
6058 6063           */
6059 6064          uid = crgetruid(newcr);
6060 6065          mutex_enter(&pidlock);
6061 6066          upcount_dec(uid, GLOBAL_ZONEID);
6062 6067          upcount_inc(uid, zoneid);
6063 6068          mutex_exit(&pidlock);
6064 6069  
6065 6070          /*
6066 6071           * Set up core file path and content.
6067 6072           */
6068 6073          set_core_defaults();
6069 6074  
6070 6075  out:
6071 6076          /*
6072 6077           * Let the other lwps continue.
                    * Both the success path and every "goto out" failure reach here;
                    * err is still 0 on success.
6073 6078           */
6074 6079          mutex_enter(&pp->p_lock);
6075 6080          if (curthread != pp->p_agenttp)
6076 6081                  continuelwps(pp);
6077 6082          mutex_exit(&pp->p_lock);
6078 6083  
6079 6084          return (err != 0 ? set_errno(err) : 0);
6080 6085  }
6081 6086  
6082 6087  /*
6083 6088   * Systemcall entry point for zone_list(2).
6084 6089   *
6085 6090   * Processes running in a (non-global) zone only see themselves.
6086 6091   * On labeled systems, they see all zones whose label they dominate.
6087 6092   */
6088 6093  static int
6089 6094  zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6090 6095  {
6091 6096          zoneid_t *zoneids;
6092 6097          zone_t *zone, *myzone;
6093 6098          uint_t user_nzones, real_nzones;
6094 6099          uint_t domi_nzones;
6095 6100          int error;
6096 6101  
6097 6102          if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6098 6103                  return (set_errno(EFAULT));
6099 6104  
6100 6105          myzone = curproc->p_zone;
6101 6106          if (myzone != global_zone) {
6102 6107                  bslabel_t *mybslab;
6103 6108  
6104 6109                  if (!is_system_labeled()) {
6105 6110                          /* just return current zone */
6106 6111                          real_nzones = domi_nzones = 1;
6107 6112                          zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6108 6113                          zoneids[0] = myzone->zone_id;
6109 6114                  } else {
6110 6115                          /* return all zones that are dominated */
6111 6116                          mutex_enter(&zonehash_lock);
6112 6117                          real_nzones = zonecount;
6113 6118                          domi_nzones = 0;
6114 6119                          if (real_nzones > 0) {
6115 6120                                  zoneids = kmem_alloc(real_nzones *
6116 6121                                      sizeof (zoneid_t), KM_SLEEP);
6117 6122                                  mybslab = label2bslabel(myzone->zone_slabel);
6118 6123                                  for (zone = list_head(&zone_active);
6119 6124                                      zone != NULL;
6120 6125                                      zone = list_next(&zone_active, zone)) {
6121 6126                                          if (zone->zone_id == GLOBAL_ZONEID)
6122 6127                                                  continue;
6123 6128                                          if (zone != myzone &&
6124 6129                                              (zone->zone_flags & ZF_IS_SCRATCH))
6125 6130                                                  continue;
6126 6131                                          /*
6127 6132                                           * Note that a label always dominates
6128 6133                                           * itself, so myzone is always included
6129 6134                                           * in the list.
6130 6135                                           */
6131 6136                                          if (bldominates(mybslab,
6132 6137                                              label2bslabel(zone->zone_slabel))) {
6133 6138                                                  zoneids[domi_nzones++] =
6134 6139                                                      zone->zone_id;
6135 6140                                          }
6136 6141                                  }
6137 6142                          }
6138 6143                          mutex_exit(&zonehash_lock);
6139 6144                  }
6140 6145          } else {
6141 6146                  mutex_enter(&zonehash_lock);
6142 6147                  real_nzones = zonecount;
6143 6148                  domi_nzones = 0;
6144 6149                  if (real_nzones > 0) {
6145 6150                          zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6146 6151                              KM_SLEEP);
6147 6152                          for (zone = list_head(&zone_active); zone != NULL;
6148 6153                              zone = list_next(&zone_active, zone))
6149 6154                                  zoneids[domi_nzones++] = zone->zone_id;
6150 6155                          ASSERT(domi_nzones == real_nzones);
6151 6156                  }
6152 6157                  mutex_exit(&zonehash_lock);
6153 6158          }
6154 6159  
6155 6160          /*
6156 6161           * If user has allocated space for fewer entries than we found, then
6157 6162           * return only up to his limit.  Either way, tell him exactly how many
6158 6163           * we found.
6159 6164           */
6160 6165          if (domi_nzones < user_nzones)
6161 6166                  user_nzones = domi_nzones;
6162 6167          error = 0;
6163 6168          if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6164 6169                  error = EFAULT;
6165 6170          } else if (zoneidlist != NULL && user_nzones != 0) {
6166 6171                  if (copyout(zoneids, zoneidlist,
6167 6172                      user_nzones * sizeof (zoneid_t)) != 0)
6168 6173                          error = EFAULT;
6169 6174          }
6170 6175  
6171 6176          if (real_nzones > 0)
6172 6177                  kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6173 6178  
6174 6179          if (error != 0)
6175 6180                  return (set_errno(error));
6176 6181          else
6177 6182                  return (0);
6178 6183  }
6179 6184  
6180 6185  /*
6181 6186   * Systemcall entry point for zone_lookup(2).
6182 6187   *
6183 6188   * Non-global zones are only able to see themselves and (on labeled systems)
6184 6189   * the zones they dominate.
6185 6190   */
6186 6191  static zoneid_t
6187 6192  zone_lookup(const char *zone_name)
6188 6193  {
6189 6194          char *kname;
6190 6195          zone_t *zone;
6191 6196          zoneid_t zoneid;
6192 6197          int err;
6193 6198  
6194 6199          if (zone_name == NULL) {
6195 6200                  /* return caller's zone id */
6196 6201                  return (getzoneid());
6197 6202          }
6198 6203  
6199 6204          kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6200 6205          if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6201 6206                  kmem_free(kname, ZONENAME_MAX);
6202 6207                  return (set_errno(err));
6203 6208          }
6204 6209  
6205 6210          mutex_enter(&zonehash_lock);
6206 6211          zone = zone_find_all_by_name(kname);
6207 6212          kmem_free(kname, ZONENAME_MAX);
6208 6213          /*
6209 6214           * In a non-global zone, can only lookup global and own name.
6210 6215           * In Trusted Extensions zone label dominance rules apply.
6211 6216           */
6212 6217          if (zone == NULL ||
6213 6218              zone_status_get(zone) < ZONE_IS_READY ||
6214 6219              !zone_list_access(zone)) {
6215 6220                  mutex_exit(&zonehash_lock);
6216 6221                  return (set_errno(EINVAL));
6217 6222          } else {
6218 6223                  zoneid = zone->zone_id;
6219 6224                  mutex_exit(&zonehash_lock);
6220 6225                  return (zoneid);
6221 6226          }
6222 6227  }
6223 6228  
6224 6229  static int
6225 6230  zone_version(int *version_arg)
6226 6231  {
6227 6232          int version = ZONE_SYSCALL_API_VERSION;
6228 6233  
6229 6234          if (copyout(&version, version_arg, sizeof (int)) != 0)
6230 6235                  return (set_errno(EFAULT));
6231 6236          return (0);
6232 6237  }
6233 6238  
6234 6239  /* ARGSUSED */
6235 6240  long
6236 6241  zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6237 6242  {
6238 6243          zone_def zs;
6239 6244          int err;
6240 6245  
6241 6246          switch (cmd) {
6242 6247          case ZONE_CREATE:
6243 6248                  if (get_udatamodel() == DATAMODEL_NATIVE) {
6244 6249                          if (copyin(arg1, &zs, sizeof (zone_def))) {
6245 6250                                  return (set_errno(EFAULT));
6246 6251                          }
6247 6252                  } else {
6248 6253  #ifdef _SYSCALL32_IMPL
6249 6254                          zone_def32 zs32;
6250 6255  
6251 6256                          if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6252 6257                                  return (set_errno(EFAULT));
6253 6258                          }
6254 6259                          zs.zone_name =
6255 6260                              (const char *)(unsigned long)zs32.zone_name;
6256 6261                          zs.zone_root =
6257 6262                              (const char *)(unsigned long)zs32.zone_root;
6258 6263                          zs.zone_privs =
6259 6264                              (const struct priv_set *)
6260 6265                              (unsigned long)zs32.zone_privs;
6261 6266                          zs.zone_privssz = zs32.zone_privssz;
6262 6267                          zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6263 6268                          zs.rctlbufsz = zs32.rctlbufsz;
6264 6269                          zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6265 6270                          zs.zfsbufsz = zs32.zfsbufsz;
6266 6271                          zs.extended_error =
6267 6272                              (int *)(unsigned long)zs32.extended_error;
6268 6273                          zs.match = zs32.match;
6269 6274                          zs.doi = zs32.doi;
6270 6275                          zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6271 6276                          zs.flags = zs32.flags;
6272 6277  #else
6273 6278                          panic("get_udatamodel() returned bogus result\n");
6274 6279  #endif
6275 6280                  }
6276 6281  
6277 6282                  return (zone_create(zs.zone_name, zs.zone_root,
6278 6283                      zs.zone_privs, zs.zone_privssz,
6279 6284                      (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6280 6285                      (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6281 6286                      zs.extended_error, zs.match, zs.doi,
6282 6287                      zs.label, zs.flags));
6283 6288          case ZONE_BOOT:
6284 6289                  return (zone_boot((zoneid_t)(uintptr_t)arg1));
6285 6290          case ZONE_DESTROY:
6286 6291                  return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6287 6292          case ZONE_GETATTR:
6288 6293                  return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6289 6294                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6290 6295          case ZONE_SETATTR:
6291 6296                  return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6292 6297                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6293 6298          case ZONE_ENTER:
6294 6299                  return (zone_enter((zoneid_t)(uintptr_t)arg1));
6295 6300          case ZONE_LIST:
6296 6301                  return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6297 6302          case ZONE_SHUTDOWN:
6298 6303                  return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6299 6304          case ZONE_LOOKUP:
6300 6305                  return (zone_lookup((const char *)arg1));
6301 6306          case ZONE_VERSION:
6302 6307                  return (zone_version((int *)arg1));
6303 6308          case ZONE_ADD_DATALINK:
6304 6309                  return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6305 6310                      (datalink_id_t)(uintptr_t)arg2));
6306 6311          case ZONE_DEL_DATALINK:
6307 6312                  return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6308 6313                      (datalink_id_t)(uintptr_t)arg2));
6309 6314          case ZONE_CHECK_DATALINK: {
6310 6315                  zoneid_t        zoneid;
6311 6316                  boolean_t       need_copyout;
6312 6317  
6313 6318                  if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6314 6319                          return (EFAULT);
6315 6320                  need_copyout = (zoneid == ALL_ZONES);
6316 6321                  err = zone_check_datalink(&zoneid,
6317 6322                      (datalink_id_t)(uintptr_t)arg2);
6318 6323                  if (err == 0 && need_copyout) {
6319 6324                          if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6320 6325                                  err = EFAULT;
6321 6326                  }
6322 6327                  return (err == 0 ? 0 : set_errno(err));
6323 6328          }
6324 6329          case ZONE_LIST_DATALINK:
6325 6330                  return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6326 6331                      (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6327 6332          default:
6328 6333                  return (set_errno(EINVAL));
6329 6334          }
6330 6335  }
6331 6336  
6332 6337  struct zarg {
6333 6338          zone_t *zone;
6334 6339          zone_cmd_arg_t arg;
6335 6340  };
6336 6341  
6337 6342  static int
6338 6343  zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6339 6344  {
6340 6345          char *buf;
6341 6346          size_t buflen;
6342 6347          int error;
6343 6348  
6344 6349          buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6345 6350          buf = kmem_alloc(buflen, KM_SLEEP);
6346 6351          (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6347 6352          error = door_ki_open(buf, doorp);
6348 6353          kmem_free(buf, buflen);
6349 6354          return (error);
6350 6355  }
6351 6356  
6352 6357  static void
6353 6358  zone_release_door(door_handle_t *doorp)
6354 6359  {
6355 6360          door_ki_rele(*doorp);
6356 6361          *doorp = NULL;
6357 6362  }
6358 6363  
6359 6364  static void
6360 6365  zone_ki_call_zoneadmd(struct zarg *zargp)
6361 6366  {
6362 6367          door_handle_t door = NULL;
6363 6368          door_arg_t darg, save_arg;
6364 6369          char *zone_name;
6365 6370          size_t zone_namelen;
6366 6371          zoneid_t zoneid;
6367 6372          zone_t *zone;
6368 6373          zone_cmd_arg_t arg;
6369 6374          uint64_t uniqid;
6370 6375          size_t size;
6371 6376          int error;
6372 6377          int retry;
6373 6378  
6374 6379          zone = zargp->zone;
6375 6380          arg = zargp->arg;
6376 6381          kmem_free(zargp, sizeof (*zargp));
6377 6382  
6378 6383          zone_namelen = strlen(zone->zone_name) + 1;
6379 6384          zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6380 6385          bcopy(zone->zone_name, zone_name, zone_namelen);
6381 6386          zoneid = zone->zone_id;
6382 6387          uniqid = zone->zone_uniqid;
6383 6388          /*
6384 6389           * zoneadmd may be down, but at least we can empty out the zone.
6385 6390           * We can ignore the return value of zone_empty() since we're called
6386 6391           * from a kernel thread and know we won't be delivered any signals.
6387 6392           */
6388 6393          ASSERT(curproc == &p0);
6389 6394          (void) zone_empty(zone);
6390 6395          ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6391 6396          zone_rele(zone);
6392 6397  
6393 6398          size = sizeof (arg);
6394 6399          darg.rbuf = (char *)&arg;
6395 6400          darg.data_ptr = (char *)&arg;
6396 6401          darg.rsize = size;
6397 6402          darg.data_size = size;
6398 6403          darg.desc_ptr = NULL;
6399 6404          darg.desc_num = 0;
6400 6405  
6401 6406          save_arg = darg;
6402 6407          /*
6403 6408           * Since we're not holding a reference to the zone, any number of
6404 6409           * things can go wrong, including the zone disappearing before we get a
6405 6410           * chance to talk to zoneadmd.
6406 6411           */
6407 6412          for (retry = 0; /* forever */; retry++) {
6408 6413                  if (door == NULL &&
6409 6414                      (error = zone_lookup_door(zone_name, &door)) != 0) {
6410 6415                          goto next;
6411 6416                  }
6412 6417                  ASSERT(door != NULL);
6413 6418  
6414 6419                  if ((error = door_ki_upcall_limited(door, &darg, NULL,
6415 6420                      SIZE_MAX, 0)) == 0) {
6416 6421                          break;
6417 6422                  }
6418 6423                  switch (error) {
6419 6424                  case EINTR:
6420 6425                          /* FALLTHROUGH */
6421 6426                  case EAGAIN:    /* process may be forking */
6422 6427                          /*
6423 6428                           * Back off for a bit
6424 6429                           */
6425 6430                          break;
6426 6431                  case EBADF:
6427 6432                          zone_release_door(&door);
6428 6433                          if (zone_lookup_door(zone_name, &door) != 0) {
6429 6434                                  /*
6430 6435                                   * zoneadmd may be dead, but it may come back to
6431 6436                                   * life later.
6432 6437                                   */
6433 6438                                  break;
6434 6439                          }
6435 6440                          break;
6436 6441                  default:
6437 6442                          cmn_err(CE_WARN,
6438 6443                              "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6439 6444                              error);
6440 6445                          goto out;
6441 6446                  }
6442 6447  next:
6443 6448                  /*
6444 6449                   * If this isn't the same zone_t that we originally had in mind,
6445 6450                   * then this is the same as if two kadmin requests come in at
6446 6451                   * the same time: the first one wins.  This means we lose, so we
6447 6452                   * bail.
6448 6453                   */
6449 6454                  if ((zone = zone_find_by_id(zoneid)) == NULL) {
6450 6455                          /*
6451 6456                           * Problem is solved.
6452 6457                           */
6453 6458                          break;
6454 6459                  }
6455 6460                  if (zone->zone_uniqid != uniqid) {
6456 6461                          /*
6457 6462                           * zoneid recycled
6458 6463                           */
6459 6464                          zone_rele(zone);
6460 6465                          break;
6461 6466                  }
6462 6467                  /*
6463 6468                   * We could zone_status_timedwait(), but there doesn't seem to
6464 6469                   * be much point in doing that (plus, it would mean that
6465 6470                   * zone_free() isn't called until this thread exits).
6466 6471                   */
6467 6472                  zone_rele(zone);
6468 6473                  delay(hz);
6469 6474                  darg = save_arg;
6470 6475          }
6471 6476  out:
6472 6477          if (door != NULL) {
6473 6478                  zone_release_door(&door);
6474 6479          }
6475 6480          kmem_free(zone_name, zone_namelen);
6476 6481          thread_exit();
6477 6482  }
6478 6483  
6479 6484  /*
6480 6485   * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6481 6486   * kadmin().  The caller is a process in the zone.
6482 6487   *
6483 6488   * In order to shutdown the zone, we will hand off control to zoneadmd
6484 6489   * (running in the global zone) via a door.  We do a half-hearted job at
6485 6490   * killing all processes in the zone, create a kernel thread to contact
6486 6491   * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6487 6492   * a form of generation number used to let zoneadmd (as well as
6488 6493   * zone_destroy()) know exactly which zone they're re talking about.
6489 6494   */
6490 6495  int
6491 6496  zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6492 6497  {
6493 6498          struct zarg *zargp;
6494 6499          zone_cmd_t zcmd;
6495 6500          zone_t *zone;
6496 6501  
6497 6502          zone = curproc->p_zone;
6498 6503          ASSERT(getzoneid() != GLOBAL_ZONEID);
6499 6504  
6500 6505          switch (cmd) {
6501 6506          case A_SHUTDOWN:
6502 6507                  switch (fcn) {
6503 6508                  case AD_HALT:
6504 6509                  case AD_POWEROFF:
6505 6510                          zcmd = Z_HALT;
6506 6511                          break;
6507 6512                  case AD_BOOT:
6508 6513                          zcmd = Z_REBOOT;
6509 6514                          break;
6510 6515                  case AD_IBOOT:
6511 6516                  case AD_SBOOT:
6512 6517                  case AD_SIBOOT:
6513 6518                  case AD_NOSYNC:
6514 6519                          return (ENOTSUP);
6515 6520                  default:
6516 6521                          return (EINVAL);
6517 6522                  }
6518 6523                  break;
6519 6524          case A_REBOOT:
6520 6525                  zcmd = Z_REBOOT;
6521 6526                  break;
6522 6527          case A_FTRACE:
6523 6528          case A_REMOUNT:
6524 6529          case A_FREEZE:
6525 6530          case A_DUMP:
6526 6531          case A_CONFIG:
6527 6532                  return (ENOTSUP);
6528 6533          default:
6529 6534                  ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6530 6535                  return (EINVAL);
6531 6536          }
6532 6537  
6533 6538          if (secpolicy_zone_admin(credp, B_FALSE))
6534 6539                  return (EPERM);
6535 6540          mutex_enter(&zone_status_lock);
6536 6541  
6537 6542          /*
6538 6543           * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6539 6544           * is in the zone.
6540 6545           */
6541 6546          ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6542 6547          if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6543 6548                  /*
6544 6549                   * This zone is already on its way down.
6545 6550                   */
6546 6551                  mutex_exit(&zone_status_lock);
6547 6552                  return (0);
6548 6553          }
6549 6554          /*
6550 6555           * Prevent future zone_enter()s
6551 6556           */
6552 6557          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6553 6558          mutex_exit(&zone_status_lock);
6554 6559  
6555 6560          /*
6556 6561           * Kill everyone now and call zoneadmd later.
6557 6562           * zone_ki_call_zoneadmd() will do a more thorough job of this
6558 6563           * later.
6559 6564           */
6560 6565          killall(zone->zone_id);
6561 6566          /*
6562 6567           * Now, create the thread to contact zoneadmd and do the rest of the
6563 6568           * work.  This thread can't be created in our zone otherwise
6564 6569           * zone_destroy() would deadlock.
6565 6570           */
6566 6571          zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6567 6572          zargp->arg.cmd = zcmd;
6568 6573          zargp->arg.uniqid = zone->zone_uniqid;
6569 6574          zargp->zone = zone;
6570 6575          (void) strcpy(zargp->arg.locale, "C");
6571 6576          /* mdep was already copied in for us by uadmin */
6572 6577          if (mdep != NULL)
6573 6578                  (void) strlcpy(zargp->arg.bootbuf, mdep,
6574 6579                      sizeof (zargp->arg.bootbuf));
6575 6580          zone_hold(zone);
6576 6581  
6577 6582          (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6578 6583              TS_RUN, minclsyspri);
6579 6584          exit(CLD_EXITED, 0);
6580 6585  
6581 6586          return (EINVAL);
6582 6587  }
6583 6588  
6584 6589  /*
6585 6590   * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6586 6591   * status to ZONE_IS_SHUTTING_DOWN.
6587 6592   *
6588 6593   * This function also shuts down all running zones to ensure that they won't
6589 6594   * fork new processes.
6590 6595   */
6591 6596  void
6592 6597  zone_shutdown_global(void)
6593 6598  {
6594 6599          zone_t *current_zonep;
6595 6600  
6596 6601          ASSERT(INGLOBALZONE(curproc));
6597 6602          mutex_enter(&zonehash_lock);
6598 6603          mutex_enter(&zone_status_lock);
6599 6604  
6600 6605          /* Modify the global zone's status first. */
6601 6606          ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6602 6607          zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6603 6608  
6604 6609          /*
6605 6610           * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6606 6611           * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6607 6612           * could cause assertions to fail (e.g., assertions about a zone's
6608 6613           * state during initialization, readying, or booting) or produce races.
6609 6614           * We'll let threads continue to initialize and ready new zones: they'll
6610 6615           * fail to boot the new zones when they see that the global zone is
6611 6616           * shutting down.
6612 6617           */
6613 6618          for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6614 6619              current_zonep = list_next(&zone_active, current_zonep)) {
6615 6620                  if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6616 6621                          zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6617 6622          }
6618 6623          mutex_exit(&zone_status_lock);
6619 6624          mutex_exit(&zonehash_lock);
6620 6625  }
6621 6626  
6622 6627  /*
6623 6628   * Returns true if the named dataset is visible in the current zone.
6624 6629   * The 'write' parameter is set to 1 if the dataset is also writable.
6625 6630   */
6626 6631  int
6627 6632  zone_dataset_visible(const char *dataset, int *write)
6628 6633  {
6629 6634          static int zfstype = -1;
6630 6635          zone_dataset_t *zd;
6631 6636          size_t len;
6632 6637          zone_t *zone = curproc->p_zone;
6633 6638          const char *name = NULL;
6634 6639          vfs_t *vfsp = NULL;
6635 6640  
6636 6641          if (dataset[0] == '\0')
6637 6642                  return (0);
6638 6643  
6639 6644          /*
6640 6645           * Walk the list once, looking for datasets which match exactly, or
6641 6646           * specify a dataset underneath an exported dataset.  If found, return
6642 6647           * true and note that it is writable.
6643 6648           */
6644 6649          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6645 6650              zd = list_next(&zone->zone_datasets, zd)) {
6646 6651  
6647 6652                  len = strlen(zd->zd_dataset);
6648 6653                  if (strlen(dataset) >= len &&
6649 6654                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6650 6655                      (dataset[len] == '\0' || dataset[len] == '/' ||
6651 6656                      dataset[len] == '@')) {
6652 6657                          if (write)
6653 6658                                  *write = 1;
6654 6659                          return (1);
6655 6660                  }
6656 6661          }
6657 6662  
6658 6663          /*
6659 6664           * Walk the list a second time, searching for datasets which are parents
6660 6665           * of exported datasets.  These should be visible, but read-only.
6661 6666           *
6662 6667           * Note that we also have to support forms such as 'pool/dataset/', with
6663 6668           * a trailing slash.
6664 6669           */
6665 6670          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6666 6671              zd = list_next(&zone->zone_datasets, zd)) {
6667 6672  
6668 6673                  len = strlen(dataset);
6669 6674                  if (dataset[len - 1] == '/')
6670 6675                          len--;  /* Ignore trailing slash */
6671 6676                  if (len < strlen(zd->zd_dataset) &&
6672 6677                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6673 6678                      zd->zd_dataset[len] == '/') {
6674 6679                          if (write)
6675 6680                                  *write = 0;
6676 6681                          return (1);
6677 6682                  }
6678 6683          }
6679 6684  
6680 6685          /*
6681 6686           * We reach here if the given dataset is not found in the zone_dataset
6682 6687           * list. Check if this dataset was added as a filesystem (ie. "add fs")
6683 6688           * instead of delegation. For this we search for the dataset in the
6684 6689           * zone_vfslist of this zone. If found, return true and note that it is
6685 6690           * not writable.
6686 6691           */
6687 6692  
6688 6693          /*
6689 6694           * Initialize zfstype if it is not initialized yet.
6690 6695           */
6691 6696          if (zfstype == -1) {
6692 6697                  struct vfssw *vswp = vfs_getvfssw("zfs");
6693 6698                  zfstype = vswp - vfssw;
6694 6699                  vfs_unrefvfssw(vswp);
6695 6700          }
6696 6701  
6697 6702          vfs_list_read_lock();
6698 6703          vfsp = zone->zone_vfslist;
6699 6704          do {
6700 6705                  ASSERT(vfsp);
6701 6706                  if (vfsp->vfs_fstype == zfstype) {
6702 6707                          name = refstr_value(vfsp->vfs_resource);
6703 6708  
6704 6709                          /*
6705 6710                           * Check if we have an exact match.
6706 6711                           */
6707 6712                          if (strcmp(dataset, name) == 0) {
6708 6713                                  vfs_list_unlock();
6709 6714                                  if (write)
6710 6715                                          *write = 0;
6711 6716                                  return (1);
6712 6717                          }
6713 6718                          /*
6714 6719                           * We need to check if we are looking for parents of
6715 6720                           * a dataset. These should be visible, but read-only.
6716 6721                           */
6717 6722                          len = strlen(dataset);
6718 6723                          if (dataset[len - 1] == '/')
6719 6724                                  len--;
6720 6725  
6721 6726                          if (len < strlen(name) &&
6722 6727                              bcmp(dataset, name, len) == 0 && name[len] == '/') {
6723 6728                                  vfs_list_unlock();
6724 6729                                  if (write)
6725 6730                                          *write = 0;
6726 6731                                  return (1);
6727 6732                          }
6728 6733                  }
6729 6734                  vfsp = vfsp->vfs_zone_next;
6730 6735          } while (vfsp != zone->zone_vfslist);
6731 6736  
6732 6737          vfs_list_unlock();
6733 6738          return (0);
6734 6739  }
6735 6740  
6736 6741  /*
6737 6742   * zone_find_by_any_path() -
6738 6743   *
6739 6744   * kernel-private routine similar to zone_find_by_path(), but which
6740 6745   * effectively compares against zone paths rather than zonerootpath
6741 6746   * (i.e., the last component of zonerootpaths, which should be "root/",
6742 6747   * are not compared.)  This is done in order to accurately identify all
6743 6748   * paths, whether zone-visible or not, including those which are parallel
6744 6749   * to /root/, such as /dev/, /home/, etc...
6745 6750   *
6746 6751   * If the specified path does not fall under any zone path then global
6747 6752   * zone is returned.
6748 6753   *
6749 6754   * The treat_abs parameter indicates whether the path should be treated as
6750 6755   * an absolute path although it does not begin with "/".  (This supports
6751 6756   * nfs mount syntax such as host:any/path.)
6752 6757   *
6753 6758   * The caller is responsible for zone_rele of the returned zone.
6754 6759   */
6755 6760  zone_t *
6756 6761  zone_find_by_any_path(const char *path, boolean_t treat_abs)
6757 6762  {
6758 6763          zone_t *zone;
6759 6764          int path_offset = 0;
6760 6765  
6761 6766          if (path == NULL) {
6762 6767                  zone_hold(global_zone);
6763 6768                  return (global_zone);
6764 6769          }
6765 6770  
6766 6771          if (*path != '/') {
6767 6772                  ASSERT(treat_abs);
6768 6773                  path_offset = 1;
6769 6774          }
6770 6775  
6771 6776          mutex_enter(&zonehash_lock);
6772 6777          for (zone = list_head(&zone_active); zone != NULL;
6773 6778              zone = list_next(&zone_active, zone)) {
6774 6779                  char    *c;
6775 6780                  size_t  pathlen;
6776 6781                  char *rootpath_start;
6777 6782  
6778 6783                  if (zone == global_zone)        /* skip global zone */
6779 6784                          continue;
6780 6785  
6781 6786                  /* scan backwards to find start of last component */
6782 6787                  c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6783 6788                  do {
6784 6789                          c--;
6785 6790                  } while (*c != '/');
6786 6791  
6787 6792                  pathlen = c - zone->zone_rootpath + 1 - path_offset;
6788 6793                  rootpath_start = (zone->zone_rootpath + path_offset);
6789 6794                  if (strncmp(path, rootpath_start, pathlen) == 0)
6790 6795                          break;
6791 6796          }
6792 6797          if (zone == NULL)
6793 6798                  zone = global_zone;
6794 6799          zone_hold(zone);
6795 6800          mutex_exit(&zonehash_lock);
6796 6801          return (zone);
6797 6802  }
6798 6803  
6799 6804  /*
6800 6805   * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6801 6806   * zone_dl_t pointer if found, and NULL otherwise.
6802 6807   */
6803 6808  static zone_dl_t *
6804 6809  zone_find_dl(zone_t *zone, datalink_id_t linkid)
6805 6810  {
6806 6811          zone_dl_t *zdl;
6807 6812  
6808 6813          ASSERT(mutex_owned(&zone->zone_lock));
6809 6814          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6810 6815              zdl = list_next(&zone->zone_dl_list, zdl)) {
6811 6816                  if (zdl->zdl_id == linkid)
6812 6817                          break;
6813 6818          }
6814 6819          return (zdl);
6815 6820  }
6816 6821  
6817 6822  static boolean_t
6818 6823  zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6819 6824  {
6820 6825          boolean_t exists;
6821 6826  
6822 6827          mutex_enter(&zone->zone_lock);
6823 6828          exists = (zone_find_dl(zone, linkid) != NULL);
6824 6829          mutex_exit(&zone->zone_lock);
6825 6830          return (exists);
6826 6831  }
6827 6832  
6828 6833  /*
6829 6834   * Add an data link name for the zone.
6830 6835   */
6831 6836  static int
6832 6837  zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6833 6838  {
6834 6839          zone_dl_t *zdl;
6835 6840          zone_t *zone;
6836 6841          zone_t *thiszone;
6837 6842  
6838 6843          if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6839 6844                  return (set_errno(ENXIO));
6840 6845  
6841 6846          /* Verify that the datalink ID doesn't already belong to a zone. */
6842 6847          mutex_enter(&zonehash_lock);
6843 6848          for (zone = list_head(&zone_active); zone != NULL;
6844 6849              zone = list_next(&zone_active, zone)) {
6845 6850                  if (zone_dl_exists(zone, linkid)) {
6846 6851                          mutex_exit(&zonehash_lock);
6847 6852                          zone_rele(thiszone);
6848 6853                          return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6849 6854                  }
6850 6855          }
6851 6856  
6852 6857          zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6853 6858          zdl->zdl_id = linkid;
6854 6859          zdl->zdl_net = NULL;
6855 6860          mutex_enter(&thiszone->zone_lock);
6856 6861          list_insert_head(&thiszone->zone_dl_list, zdl);
6857 6862          mutex_exit(&thiszone->zone_lock);
6858 6863          mutex_exit(&zonehash_lock);
6859 6864          zone_rele(thiszone);
6860 6865          return (0);
6861 6866  }
6862 6867  
6863 6868  static int
6864 6869  zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6865 6870  {
6866 6871          zone_dl_t *zdl;
6867 6872          zone_t *zone;
6868 6873          int err = 0;
6869 6874  
6870 6875          if ((zone = zone_find_by_id(zoneid)) == NULL)
6871 6876                  return (set_errno(EINVAL));
6872 6877  
6873 6878          mutex_enter(&zone->zone_lock);
6874 6879          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6875 6880                  err = ENXIO;
6876 6881          } else {
6877 6882                  list_remove(&zone->zone_dl_list, zdl);
6878 6883                  if (zdl->zdl_net != NULL)
6879 6884                          nvlist_free(zdl->zdl_net);
6880 6885                  kmem_free(zdl, sizeof (zone_dl_t));
6881 6886          }
6882 6887          mutex_exit(&zone->zone_lock);
6883 6888          zone_rele(zone);
6884 6889          return (err == 0 ? 0 : set_errno(err));
6885 6890  }
6886 6891  
6887 6892  /*
6888 6893   * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
6889 6894   * the linkid.  Otherwise we just check if the specified zoneidp has been
6890 6895   * assigned the supplied linkid.
6891 6896   */
6892 6897  int
6893 6898  zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6894 6899  {
6895 6900          zone_t *zone;
6896 6901          int err = ENXIO;
6897 6902  
6898 6903          if (*zoneidp != ALL_ZONES) {
6899 6904                  if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6900 6905                          if (zone_dl_exists(zone, linkid))
6901 6906                                  err = 0;
6902 6907                          zone_rele(zone);
6903 6908                  }
6904 6909                  return (err);
6905 6910          }
6906 6911  
6907 6912          mutex_enter(&zonehash_lock);
6908 6913          for (zone = list_head(&zone_active); zone != NULL;
6909 6914              zone = list_next(&zone_active, zone)) {
6910 6915                  if (zone_dl_exists(zone, linkid)) {
6911 6916                          *zoneidp = zone->zone_id;
6912 6917                          err = 0;
6913 6918                          break;
6914 6919                  }
6915 6920          }
6916 6921          mutex_exit(&zonehash_lock);
6917 6922          return (err);
6918 6923  }
6919 6924  
6920 6925  /*
6921 6926   * Get the list of datalink IDs assigned to a zone.
6922 6927   *
6923 6928   * On input, *nump is the number of datalink IDs that can fit in the supplied
6924 6929   * idarray.  Upon return, *nump is either set to the number of datalink IDs
6925 6930   * that were placed in the array if the array was large enough, or to the
6926 6931   * number of datalink IDs that the function needs to place in the array if the
6927 6932   * array is too small.
6928 6933   */
6929 6934  static int
6930 6935  zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6931 6936  {
6932 6937          uint_t num, dlcount;
6933 6938          zone_t *zone;
6934 6939          zone_dl_t *zdl;
6935 6940          datalink_id_t *idptr = idarray;
6936 6941  
6937 6942          if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6938 6943                  return (set_errno(EFAULT));
6939 6944          if ((zone = zone_find_by_id(zoneid)) == NULL)
6940 6945                  return (set_errno(ENXIO));
6941 6946  
6942 6947          num = 0;
6943 6948          mutex_enter(&zone->zone_lock);
6944 6949          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6945 6950              zdl = list_next(&zone->zone_dl_list, zdl)) {
6946 6951                  /*
6947 6952                   * If the list is bigger than what the caller supplied, just
6948 6953                   * count, don't do copyout.
6949 6954                   */
6950 6955                  if (++num > dlcount)
6951 6956                          continue;
6952 6957                  if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
6953 6958                          mutex_exit(&zone->zone_lock);
6954 6959                          zone_rele(zone);
6955 6960                          return (set_errno(EFAULT));
6956 6961                  }
6957 6962                  idptr++;
6958 6963          }
6959 6964          mutex_exit(&zone->zone_lock);
6960 6965          zone_rele(zone);
6961 6966  
6962 6967          /* Increased or decreased, caller should be notified. */
6963 6968          if (num != dlcount) {
6964 6969                  if (copyout(&num, nump, sizeof (num)) != 0)
6965 6970                          return (set_errno(EFAULT));
6966 6971          }
6967 6972          return (0);
6968 6973  }
6969 6974  
6970 6975  /*
6971 6976   * Public interface for looking up a zone by zoneid. It's a customized version
6972 6977   * for netstack_zone_create(). It can only be called from the zsd create
6973 6978   * callbacks, since it doesn't have reference on the zone structure hence if
6974 6979   * it is called elsewhere the zone could disappear after the zonehash_lock
6975 6980   * is dropped.
6976 6981   *
6977 6982   * Furthermore it
6978 6983   * 1. Doesn't check the status of the zone.
6979 6984   * 2. It will be called even before zone_init is called, in that case the
6980 6985   *    address of zone0 is returned directly, and netstack_zone_create()
6981 6986   *    will only assign a value to zone0.zone_netstack, won't break anything.
6982 6987   * 3. Returns without the zone being held.
6983 6988   */
6984 6989  zone_t *
6985 6990  zone_find_by_id_nolock(zoneid_t zoneid)
6986 6991  {
6987 6992          zone_t *zone;
6988 6993  
6989 6994          mutex_enter(&zonehash_lock);
6990 6995          if (zonehashbyid == NULL)
6991 6996                  zone = &zone0;
6992 6997          else
6993 6998                  zone = zone_find_all_by_id(zoneid);
6994 6999          mutex_exit(&zonehash_lock);
6995 7000          return (zone);
6996 7001  }
6997 7002  
6998 7003  /*
6999 7004   * Walk the datalinks for a given zone
7000 7005   */
7001 7006  int
7002 7007  zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7003 7008      void *data)
7004 7009  {
7005 7010          zone_t          *zone;
7006 7011          zone_dl_t       *zdl;
7007 7012          datalink_id_t   *idarray;
7008 7013          uint_t          idcount = 0;
7009 7014          int             i, ret = 0;
7010 7015  
7011 7016          if ((zone = zone_find_by_id(zoneid)) == NULL)
7012 7017                  return (ENOENT);
7013 7018  
7014 7019          /*
7015 7020           * We first build an array of linkid's so that we can walk these and
7016 7021           * execute the callback with the zone_lock dropped.
7017 7022           */
7018 7023          mutex_enter(&zone->zone_lock);
7019 7024          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7020 7025              zdl = list_next(&zone->zone_dl_list, zdl)) {
7021 7026                  idcount++;
7022 7027          }
7023 7028  
7024 7029          if (idcount == 0) {
7025 7030                  mutex_exit(&zone->zone_lock);
7026 7031                  zone_rele(zone);
7027 7032                  return (0);
7028 7033          }
7029 7034  
7030 7035          idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7031 7036          if (idarray == NULL) {
7032 7037                  mutex_exit(&zone->zone_lock);
7033 7038                  zone_rele(zone);
7034 7039                  return (ENOMEM);
7035 7040          }
7036 7041  
7037 7042          for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7038 7043              i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7039 7044                  idarray[i] = zdl->zdl_id;
7040 7045          }
7041 7046  
7042 7047          mutex_exit(&zone->zone_lock);
7043 7048  
7044 7049          for (i = 0; i < idcount && ret == 0; i++) {
7045 7050                  if ((ret = (*cb)(idarray[i], data)) != 0)
7046 7051                          break;
7047 7052          }
7048 7053  
7049 7054          zone_rele(zone);
7050 7055          kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7051 7056          return (ret);
7052 7057  }
7053 7058  
7054 7059  static char *
7055 7060  zone_net_type2name(int type)
7056 7061  {
7057 7062          switch (type) {
7058 7063          case ZONE_NETWORK_ADDRESS:
7059 7064                  return (ZONE_NET_ADDRNAME);
7060 7065          case ZONE_NETWORK_DEFROUTER:
7061 7066                  return (ZONE_NET_RTRNAME);
7062 7067          default:
7063 7068                  return (NULL);
7064 7069          }
7065 7070  }
7066 7071  
7067 7072  static int
7068 7073  zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7069 7074  {
7070 7075          zone_t *zone;
7071 7076          zone_dl_t *zdl;
7072 7077          nvlist_t *nvl;
7073 7078          int err = 0;
7074 7079          uint8_t *new = NULL;
7075 7080          char *nvname;
7076 7081          int bufsize;
7077 7082          datalink_id_t linkid = znbuf->zn_linkid;
7078 7083  
7079 7084          if (secpolicy_zone_config(CRED()) != 0)
7080 7085                  return (set_errno(EPERM));
7081 7086  
7082 7087          if (zoneid == GLOBAL_ZONEID)
7083 7088                  return (set_errno(EINVAL));
7084 7089  
7085 7090          nvname = zone_net_type2name(znbuf->zn_type);
7086 7091          bufsize = znbuf->zn_len;
7087 7092          new = znbuf->zn_val;
7088 7093          if (nvname == NULL)
7089 7094                  return (set_errno(EINVAL));
7090 7095  
7091 7096          if ((zone = zone_find_by_id(zoneid)) == NULL) {
7092 7097                  return (set_errno(EINVAL));
7093 7098          }
7094 7099  
7095 7100          mutex_enter(&zone->zone_lock);
7096 7101          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7097 7102                  err = ENXIO;
7098 7103                  goto done;
7099 7104          }
7100 7105          if ((nvl = zdl->zdl_net) == NULL) {
7101 7106                  if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7102 7107                          err = ENOMEM;
7103 7108                          goto done;
7104 7109                  } else {
7105 7110                          zdl->zdl_net = nvl;
7106 7111                  }
7107 7112          }
7108 7113          if (nvlist_exists(nvl, nvname)) {
7109 7114                  err = EINVAL;
7110 7115                  goto done;
7111 7116          }
7112 7117          err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7113 7118          ASSERT(err == 0);
7114 7119  done:
7115 7120          mutex_exit(&zone->zone_lock);
7116 7121          zone_rele(zone);
7117 7122          if (err != 0)
7118 7123                  return (set_errno(err));
7119 7124          else
7120 7125                  return (0);
7121 7126  }
7122 7127  
7123 7128  static int
7124 7129  zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7125 7130  {
7126 7131          zone_t *zone;
7127 7132          zone_dl_t *zdl;
7128 7133          nvlist_t *nvl;
7129 7134          uint8_t *ptr;
7130 7135          uint_t psize;
7131 7136          int err = 0;
7132 7137          char *nvname;
7133 7138          int bufsize;
7134 7139          void *buf;
7135 7140          datalink_id_t linkid = znbuf->zn_linkid;
7136 7141  
7137 7142          if (zoneid == GLOBAL_ZONEID)
7138 7143                  return (set_errno(EINVAL));
7139 7144  
7140 7145          nvname = zone_net_type2name(znbuf->zn_type);
7141 7146          bufsize = znbuf->zn_len;
7142 7147          buf = znbuf->zn_val;
7143 7148  
7144 7149          if (nvname == NULL)
7145 7150                  return (set_errno(EINVAL));
7146 7151          if ((zone = zone_find_by_id(zoneid)) == NULL)
7147 7152                  return (set_errno(EINVAL));
7148 7153  
7149 7154          mutex_enter(&zone->zone_lock);
7150 7155          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7151 7156                  err = ENXIO;
7152 7157                  goto done;
7153 7158          }
7154 7159          if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7155 7160                  err = ENOENT;
7156 7161                  goto done;
7157 7162          }
7158 7163          err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7159 7164          ASSERT(err == 0);
7160 7165  
7161 7166          if (psize > bufsize) {
7162 7167                  err = ENOBUFS;
7163 7168                  goto done;
7164 7169          }
7165 7170          znbuf->zn_len = psize;
7166 7171          bcopy(ptr, buf, psize);
7167 7172  done:
7168 7173          mutex_exit(&zone->zone_lock);
7169 7174          zone_rele(zone);
7170 7175          if (err != 0)
7171 7176                  return (set_errno(err));
7172 7177          else
7173 7178                  return (0);
7174 7179  }

↓ open down ↓

3197 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX