il_7029-4 Wdiff usr/src/uts/common/os/zone.c

Print this page

7029 want per-process exploit mitigation features (secflags)
7030 want basic address space layout randomization (aslr)
7031 noexec_user_stack should be a secflag
7032 want a means to forbid mappings around NULL.

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/zone.c
          +++ new/usr/src/uts/common/os/zone.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2015, Joyent Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  /*
  28   28   * Zones
  29   29   *
  30   30   *   A zone is a named collection of processes, namespace constraints,
  31   31   *   and other system resources which comprise a secure and manageable
  32   32   *   application containment facility.
  33   33   *
  34   34   *   Zones (represented by the reference counted zone_t) are tracked in
  35   35   *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  36   36   *   (zoneid_t) are used to track zone association.  Zone IDs are
  37   37   *   dynamically generated when the zone is created; if a persistent
  38   38   *   identifier is needed (core files, accounting logs, audit trail,
  39   39   *   etc.), the zone name should be used.
  40   40   *
  41   41   *
  42   42   *   Global Zone:
  43   43   *
  44   44   *   The global zone (zoneid 0) is automatically associated with all
  45   45   *   system resources that have not been bound to a user-created zone.
  46   46   *   This means that even systems where zones are not in active use
  47   47   *   have a global zone, and all processes, mounts, etc. are
  48   48   *   associated with that zone.  The global zone is generally
  49   49   *   unconstrained in terms of privileges and access, though the usual
  50   50   *   credential and privilege based restrictions apply.
  51   51   *
  52   52   *
  53   53   *   Zone States:
  54   54   *
  55   55   *   The states a zone may be in and the transitions between them are as
  56   56   *   follows:
  57   57   *
  58   58   *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  59   59   *   initialized zone is added to the list of active zones on the system but
  60   60   *   isn't accessible.
  61   61   *
  62   62   *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  63   63   *   not yet completed. Not possible to enter the zone, but attributes can
  64   64   *   be retrieved.
  65   65   *
  66   66   *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  67   67   *   ready.  The zone is made visible after the ZSD constructor callbacks are
  68   68   *   executed.  A zone remains in this state until it transitions into
  69   69   *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  70   70   *
  71   71   *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  72   72   *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  73   73   *   state.
  74   74   *
  75   75   *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  76   76   *   successfully started init.   A zone remains in this state until
  77   77   *   zone_shutdown() is called.
  78   78   *
  79   79   *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  80   80   *   killing all processes running in the zone. The zone remains
  81   81   *   in this state until there are no more user processes running in the zone.
  82   82   *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  83   83   *   Since zone_shutdown() is restartable, it may be called successfully
  84   84   *   multiple times for the same zone_t.  Setting of the zone's state to
  85   85   *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  86   86   *   the zone's status without worrying about it being a moving target.
  87   87   *
  88   88   *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  89   89   *   are no more user processes in the zone.  The zone remains in this
  90   90   *   state until there are no more kernel threads associated with the
  91   91   *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  92   92   *   fail.
  93   93   *
  94   94   *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  95   95   *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  96   96   *   join the zone or create kernel threads therein.
  97   97   *
  98   98   *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
  99   99   *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 100  100   *   return NULL from now on.
 101  101   *
 102  102   *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 103  103   *   processes or threads doing work on behalf of the zone.  The zone is
 104  104   *   removed from the list of active zones.  zone_destroy() returns, and
 105  105   *   the zone can be recreated.
 106  106   *
 107  107   *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 108  108   *   callbacks are executed, and all memory associated with the zone is
 109  109   *   freed.
 110  110   *
 111  111   *   Threads can wait for the zone to enter a requested state by using
 112  112   *   zone_status_wait() or zone_status_timedwait() with the desired
 113  113   *   state passed in as an argument.  Zone state transitions are
 114  114   *   uni-directional; it is not possible to move back to an earlier state.
 115  115   *
 116  116   *
 117  117   *   Zone-Specific Data:
 118  118   *
 119  119   *   Subsystems needing to maintain zone-specific data can store that
 120  120   *   data using the ZSD mechanism.  This provides a zone-specific data
 121  121   *   store, similar to thread-specific data (see pthread_getspecific(3C)
 122  122   *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 123  123   *   to register callbacks to be invoked when a zone is created, shut
 124  124   *   down, or destroyed.  This can be used to initialize zone-specific
 125  125   *   data for new zones and to clean up when zones go away.
 126  126   *
 127  127   *
 128  128   *   Data Structures:
 129  129   *
 130  130   *   The per-zone structure (zone_t) is reference counted, and freed
 131  131   *   when all references are released.  zone_hold and zone_rele can be
 132  132   *   used to adjust the reference count.  In addition, reference counts
 133  133   *   associated with the cred_t structure are tracked separately using
 134  134   *   zone_cred_hold and zone_cred_rele.
 135  135   *
 136  136   *   Pointers to active zone_t's are stored in two hash tables; one
 137  137   *   for searching by id, the other for searching by name.  Lookups
 138  138   *   can be performed on either basis, using zone_find_by_id and
 139  139   *   zone_find_by_name.  Both return zone_t pointers with the zone
 140  140   *   held, so zone_rele should be called when the pointer is no longer
 141  141   *   needed.  Zones can also be searched by path; zone_find_by_path
 142  142   *   returns the zone with which a path name is associated (global
 143  143   *   zone if the path is not within some other zone's file system
 144  144   *   hierarchy).  This currently requires iterating through each zone,
 145  145   *   so it is slower than an id or name search via a hash table.
 146  146   *
 147  147   *
 148  148   *   Locking:
 149  149   *
 150  150   *   zonehash_lock: This is a top-level global lock used to protect the
 151  151   *       zone hash tables and lists.  Zones cannot be created or destroyed
 152  152   *       while this lock is held.
 153  153   *   zone_status_lock: This is a global lock protecting zone state.
 154  154   *       Zones cannot change state while this lock is held.  It also
 155  155   *       protects the list of kernel threads associated with a zone.
 156  156   *   zone_lock: This is a per-zone lock used to protect several fields of
 157  157   *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 158  158   *       this lock means that the zone cannot go away.
 159  159   *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 160  160   *       related to the zone.max-lwps rctl.
 161  161   *   zone_mem_lock: This is a per-zone lock used to protect the fields
 162  162   *       related to the zone.max-locked-memory and zone.max-swap rctls.
 163  163   *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 164  164   *       currently just max_lofi
 165  165   *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 166  166   *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 167  167   *       list (a list of zones in the ZONE_IS_DEAD state).
 168  168   *
 169  169   *   Ordering requirements:
 170  170   *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 171  171   *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 172  172   *
 173  173   *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 174  174   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 175  175   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 176  176   *
 177  177   *   Blocking memory allocations are permitted while holding any of the
 178  178   *   zone locks.
 179  179   *
 180  180   *
 181  181   *   System Call Interface:
 182  182   *
 183  183   *   The zone subsystem can be managed and queried from user level with
 184  184   *   the following system calls (all subcodes of the primary "zone"
 185  185   *   system call):
 186  186   *   - zone_create: creates a zone with selected attributes (name,
 187  187   *     root path, privileges, resource controls, ZFS datasets)
 188  188   *   - zone_enter: allows the current process to enter a zone
 189  189   *   - zone_getattr: reports attributes of a zone
 190  190   *   - zone_setattr: set attributes of a zone
 191  191   *   - zone_boot: set 'init' running for the zone
 192  192   *   - zone_list: lists all zones active in the system
 193  193   *   - zone_lookup: looks up zone id based on name
 194  194   *   - zone_shutdown: initiates shutdown process (see states above)
 195  195   *   - zone_destroy: completes shutdown process (see states above)
 196  196   *
 197  197   */
 198  198  
 199  199  #include <sys/priv_impl.h>
 200  200  #include <sys/cred.h>
 201  201  #include <c2/audit.h>
 202  202  #include <sys/debug.h>
 203  203  #include <sys/file.h>
 204  204  #include <sys/kmem.h>
 205  205  #include <sys/kstat.h>
 206  206  #include <sys/mutex.h>
 207  207  #include <sys/note.h>
 208  208  #include <sys/pathname.h>
 209  209  #include <sys/proc.h>
 210  210  #include <sys/project.h>
 211  211  #include <sys/sysevent.h>
 212  212  #include <sys/task.h>
 213  213  #include <sys/systm.h>
 214  214  #include <sys/types.h>
 215  215  #include <sys/utsname.h>
 216  216  #include <sys/vnode.h>
 217  217  #include <sys/vfs.h>
 218  218  #include <sys/systeminfo.h>
 219  219  #include <sys/policy.h>
 220  220  #include <sys/cred_impl.h>
 221  221  #include <sys/contract_impl.h>
 222  222  #include <sys/contract/process_impl.h>
 223  223  #include <sys/class.h>
 224  224  #include <sys/pool.h>
 225  225  #include <sys/pool_pset.h>
 226  226  #include <sys/pset.h>
 227  227  #include <sys/strlog.h>
 228  228  #include <sys/sysmacros.h>
 229  229  #include <sys/callb.h>
 230  230  #include <sys/vmparam.h>
 231  231  #include <sys/corectl.h>
 232  232  #include <sys/ipc_impl.h>
 233  233  #include <sys/klpd.h>
 234  234  
 235  235  #include <sys/door.h>
 236  236  #include <sys/cpuvar.h>
 237  237  #include <sys/sdt.h>
 238  238  
 239  239  #include <sys/uadmin.h>
 240  240  #include <sys/session.h>
 241  241  #include <sys/cmn_err.h>
 242  242  #include <sys/modhash.h>
 243  243  #include <sys/sunddi.h>
 244  244  #include <sys/nvpair.h>
 245  245  #include <sys/rctl.h>
 246  246  #include <sys/fss.h>
 247  247  #include <sys/brand.h>
 248  248  #include <sys/zone.h>
 249  249  #include <net/if.h>
 250  250  #include <sys/cpucaps.h>
 251  251  #include <vm/seg.h>
 252  252  #include <sys/mac.h>
 253  253  
 254  254  /*
 255  255   * This constant specifies the number of seconds that threads waiting for
 256  256   * subsystems to release a zone's general-purpose references will wait before
 257  257   * they log the zone's reference counts.  The constant's value shouldn't
 258  258   * be so small that reference counts are unnecessarily reported for zones
 259  259   * whose references are slowly released.  On the other hand, it shouldn't be so
 260  260   * large that users reboot their systems out of frustration over hung zones
 261  261   * before the system logs the zones' reference counts.
 262  262   */
 263  263  #define ZONE_DESTROY_TIMEOUT_SECS       60
 264  264  
 265  265  /* One entry in the list of data link IDs accessible from a zone */
 266  266  typedef struct zone_dl {
 267  267          datalink_id_t   zdl_id;         /* ID of the data link */
 268  268          nvlist_t        *zdl_net;       /* per-link net data? TODO confirm */
 269  269          list_node_t     zdl_linkage;    /* list linkage for this entry */
 270  270  } zone_dl_t;
 271  271  
 272  272  /*
 273  273   * cv used to signal that all references to the zone have been released.  This
 274  274   * needs to be global since there may be multiple waiters, and the first to
 275  275   * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 276  276   */
 277  277  static kcondvar_t zone_destroy_cv;
 278  278  /*
 279  279   * Lock used to serialize access to zone_cv.  This could have been per-zone,
 280  280   * but then we'd need another lock for zone_destroy_cv, and why bother?
 281  281   */
 282  282  static kmutex_t zone_status_lock;
 283  283  
 284  284  /*
 285  285   * ZSD-related global variables.
 286  286   */
 287  287  static kmutex_t zsd_key_lock;   /* protects the following two */
 288  288  /*
 289  289   * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 290  290   */
 291  291  static zone_key_t zsd_keyval = 0;
 292  292  /*
 293  293   * Global list of registered keys.  We use this when a new zone is created.
 294  294   */
 295  295  static list_t zsd_registered_keys;
 296  296  
 297  297  int zone_hash_size = 256;
 298  298  static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 299  299  static kmutex_t zonehash_lock;
 300  300  static uint_t zonecount;
 301  301  static id_space_t *zoneid_space;
 302  302  
 303  303  /*
 304  304   * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 305  305   * kernel proper runs, and which manages all other zones.
 306  306   *
 307  307   * Although not declared as static, the variable "zone0" should not be used
 308  308   * except by code that needs to reference the global zone early on in boot,
 309  309   * before it is fully initialized.  All other consumers should use
 310  310   * 'global_zone'.
 311  311   */
 312  312  zone_t zone0;
 313  313  zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 314  314  
 315  315  /*
 316  316   * List of active zones, protected by zonehash_lock.
 317  317   */
 318  318  static list_t zone_active;
 319  319  
 320  320  /*
 321  321   * List of destroyed zones that still have outstanding cred references.
 322  322   * Used for debugging.  Uses a separate lock to avoid lock ordering
 323  323   * problems in zone_free.
 324  324   */
 325  325  static list_t zone_deathrow;
 326  326  static kmutex_t zone_deathrow_lock;
 327  327  
 328  328  /* number of zones is limited by virtual interface limit in IP */
 329  329  uint_t maxzones = 8192;
 330  330  
 331  331  /* Event channel to send zone state change notifications */
 332  332  evchan_t *zone_event_chan;
 333  333  
 334  334  /*
 335  335   * This table holds the mapping from kernel zone states to
 336  336   * states visible in the state notification API.
 337  337   * The idea is that we only expose "obvious" states and
 338  338   * do not expose states which are just implementation details.
 339  339   */
 340  340  const char  *zone_status_table[] = {
 341  341          ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 342  342          ZONE_EVENT_INITIALIZED,         /* initialized */
 343  343          ZONE_EVENT_READY,               /* ready */
 344  344          ZONE_EVENT_READY,               /* booting */
 345  345          ZONE_EVENT_RUNNING,             /* running */
 346  346          ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 347  347          ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 348  348          ZONE_EVENT_SHUTTING_DOWN,       /* down */
 349  349          ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 350  350          ZONE_EVENT_UNINITIALIZED,       /* dead */
 351  351  };
 352  352  
 353  353  /*
 354  354   * This array contains the names of the subsystems listed in zone_ref_subsys_t
 355  355   * (see sys/zone.h).
 356  356   */
 357  357  static char *zone_ref_subsys_names[] = {
 358  358          "NFS",          /* ZONE_REF_NFS */
 359  359          "NFSv4",        /* ZONE_REF_NFSV4 */
 360  360          "SMBFS",        /* ZONE_REF_SMBFS */
 361  361          "MNTFS",        /* ZONE_REF_MNTFS */
 362  362          "LOFI",         /* ZONE_REF_LOFI */
 363  363          "VFS",          /* ZONE_REF_VFS */
 364  364          "IPC"           /* ZONE_REF_IPC */
 365  365  };
 366  366  
 367  367  /*
 368  368   * This isn't static so lint doesn't complain.
 369  369   */
 370  370  rctl_hndl_t rc_zone_cpu_shares;
 371  371  rctl_hndl_t rc_zone_locked_mem;
 372  372  rctl_hndl_t rc_zone_max_swap;
 373  373  rctl_hndl_t rc_zone_max_lofi;
 374  374  rctl_hndl_t rc_zone_cpu_cap;
 375  375  rctl_hndl_t rc_zone_nlwps;
 376  376  rctl_hndl_t rc_zone_nprocs;
 377  377  rctl_hndl_t rc_zone_shmmax;
 378  378  rctl_hndl_t rc_zone_shmmni;
 379  379  rctl_hndl_t rc_zone_semmni;
 380  380  rctl_hndl_t rc_zone_msgmni;
 381  381  
 382  382  const char * const zone_default_initname = "/sbin/init";
 383  383  static char * const zone_prefix = "/zone/";
 384  384  static int zone_shutdown(zoneid_t zoneid);
 385  385  static int zone_add_datalink(zoneid_t, datalink_id_t);
 386  386  static int zone_remove_datalink(zoneid_t, datalink_id_t);
 387  387  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 388  388  static int zone_set_network(zoneid_t, zone_net_data_t *);
 389  389  static int zone_get_network(zoneid_t, zone_net_data_t *);
 390  390  
 391  391  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 392  392  
 393  393  static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 394  394  static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 395  395  static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 396  396  static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 397  397      zone_key_t);
 398  398  static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 399  399  static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 400  400      kmutex_t *);
 401  401  static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 402  402      kmutex_t *);
 403  403  
 404  404  /*
 405  405   * Bump this number when you alter the zone syscall interfaces; this is
 406  406   * because we need to have support for previous API versions in libc
 407  407   * to support patching; libc calls into the kernel to determine this number.
 408  408   *
 409  409   * Version 1 of the API is the version originally shipped with Solaris 10
 410  410   * Version 2 alters the zone_create system call in order to support more
 411  411   *     arguments by moving the args into a structure; and to do better
 412  412   *     error reporting when zone_create() fails.
 413  413   * Version 3 alters the zone_create system call in order to support the
 414  414   *     import of ZFS datasets to zones.
 415  415   * Version 4 alters the zone_create system call in order to support
 416  416   *     Trusted Extensions.
 417  417   * Version 5 alters the zone_boot system call, and converts its old
 418  418   *     bootargs parameter to be set by the zone_setattr API instead.
 419  419   * Version 6 adds the flag argument to zone_create.
 420  420   */
 421  421  static const int ZONE_SYSCALL_API_VERSION = 6;
 422  422  
 423  423  /*
 424  424   * Certain filesystems (such as NFS and autofs) need to know which zone
 425  425   * the mount is being placed in.  Because of this, we need to be able to
 426  426   * ensure that a zone isn't in the process of being created/destroyed such
 427  427   * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 428  428   * it gets added to the list of mounted zones, it ends up on the wrong zone's
 429  429   * mount list. Since a zone can't reside on an NFS file system, we don't
 430  430   * have to worry about the zonepath itself.
 431  431   *
 432  432   * The following functions: block_mounts()/resume_mounts() and
 433  433   * mount_in_progress()/mount_completed() are used by zones and the VFS
 434  434   * layer (respectively) to synchronize zone state transitions and new
 435  435   * mounts within a zone. This synchronization is on a per-zone basis, so
 436  436   * activity for one zone will not interfere with activity for another zone.
 437  437   *
 438  438   * The semantics are like a reader-reader lock such that there may
 439  439   * either be multiple mounts (or zone state transitions, if that weren't
 440  440   * serialized by zonehash_lock) in progress at the same time, but not
 441  441   * both.
 442  442   *
 443  443   * We use cv's so the user can ctrl-C out of the operation if it's
 444  444   * taking too long.
 445  445   *
 446  446   * The semantics are such that there is unfair bias towards the
 447  447   * "current" operation.  This means that zone halt may starve if
 448  448   * there is a rapid succession of new mounts coming in to the zone.
 449  449   */
 450  450  /*
 451  451   * Prevent new mounts in zp from reaching VFS_MOUNT(), first waiting for
 452  452   * any mounts already in this "region" to complete.  Returns 1 once mounts
 453  453   * are blocked, or 0 if the wait was interrupted (nothing is blocked).
 454  454   */
 455  455  static int
 456  456  block_mounts(zone_t *zp)
 457  457  {
 458  458          int retval = 0;         /* stays 0 unless mounts get blocked */
 459  459  
 460  460          /*
 461  461           * Since it may block for a long time, block_mounts() shouldn't be
 462  462           * called with zonehash_lock held.
 463  463           */
 464  464          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 465  465          mutex_enter(&zp->zone_mount_lock);
 466  466          while (zp->zone_mounts_in_progress > 0) {       /* mounts active */
 467  467                  if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 468  468                          goto signaled;  /* give up; counter untouched */
 469  469          }
 470  470          /*
 471  471           * A negative value of mounts_in_progress indicates that mounts
 472  472           * have been blocked by (-mounts_in_progress) different callers
 473  473           * (remotely possible if two threads enter zone_shutdown at the same
 474  474           * time).
 475  475           */
 476  476          zp->zone_mounts_in_progress--;  /* undone by resume_mounts() */
 477  477          retval = 1;
 478  478  signaled:
 479  479          mutex_exit(&zp->zone_mount_lock);
 480  480          return (retval);
 481  481  }
 482  482  
 483  483  /*
 484  484   * Undo one block_mounts() on zp.  The counter is negative while blocked;
 485  485   * once it climbs back to zero, wake every waiter on zone_mount_cv.
 486  486   */
 487  487  static void
 488  488  resume_mounts(zone_t *zp)
 489  489  {
 490  490          mutex_enter(&zp->zone_mount_lock);
 491  491          if (++zp->zone_mounts_in_progress == 0) /* last blocker gone */
 492  492                  cv_broadcast(&zp->zone_mount_cv);
 493  493          mutex_exit(&zp->zone_mount_lock);
 494  494  }
 495  495  
 496  496  /*
 497  497   * The VFS layer is starting a mount in zp.  Wait while mounts are blocked
 498  498   * (negative count, see block_mounts()), then count this mount as active.
 499  499   */
 500  500  void
 501  501  mount_in_progress(zone_t *zp)
 502  502  {
 503  503          mutex_enter(&zp->zone_mount_lock);
 504  504          while (zp->zone_mounts_in_progress < 0) /* mounts are blocked */
 505  505                  cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 506  506          zp->zone_mounts_in_progress++;  /* undone by mount_completed() */
 507  507          mutex_exit(&zp->zone_mount_lock);
 508  508  }
 509  509  
 510  510  /*
 511  511   * VFS is done with one mount in zp; drop the active-mount count and wake
 512  512   * up any waiting block_mounts() callers if this was the last mount.
 513  513   */
 514  514  void
 515  515  mount_completed(zone_t *zp)
 516  516  {
 517  517          mutex_enter(&zp->zone_mount_lock);
 518  518          if (--zp->zone_mounts_in_progress == 0) /* no mounts remain */
 519  519                  cv_broadcast(&zp->zone_mount_cv);
 520  520          mutex_exit(&zp->zone_mount_lock);
 521  521  }
 522  522  
 523  523  /*
 524  524   * ZSD routines.
 525  525   *
 526  526   * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 527  527   * defined by the pthread_key_create() and related interfaces.
 528  528   *
 529  529   * Kernel subsystems may register one or more data items and/or
 530  530   * callbacks to be executed when a zone is created, shutdown, or
 531  531   * destroyed.
 532  532   *
 533  533   * Unlike the thread counterpart, destructor callbacks will be executed
 534  534   * even if the data pointer is NULL and/or there are no constructor
 535  535   * callbacks, so it is the responsibility of such callbacks to check for
 536  536   * NULL data values if necessary.
 537  537   *
 538  538   * The locking strategy and overall picture is as follows:
 539  539   *
 540  540   * When someone calls zone_key_create(), a template ZSD entry is added to the
 541  541   * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 542  542   * holding that lock all the existing zones are marked as
 543  543   * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 544  544   * zone_zsd list (protected by zone_lock). The global list is updated first
 545  545   * (under zsd_key_lock) to make sure that newly created zones use the
 546  546   * most recent list of keys. Then under zonehash_lock we walk the zones
 547  547   * and mark them.  Similar locking is used in zone_key_delete().
 548  548   *
 549  549   * The actual create, shutdown, and destroy callbacks are done without
 550  550   * holding any lock. And zsd_flags are used to ensure that the operations
 551  551   * completed so that when zone_key_create (and zone_create) is done, as well as
 552  552   * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 553  553   * are completed.
 554  554   *
 555  555   * When new zones are created constructor callbacks for all registered ZSD
 556  556   * entries will be called. That also uses the above two phases of marking
 557  557   * what needs to be done, and then running the callbacks without holding
 558  558   * any locks.
 559  559   *
 560  560   * The framework does not provide any locking around zone_getspecific() and
 561  561   * zone_setspecific() apart from that needed for internal consistency, so
 562  562   * callers interested in atomic "test-and-set" semantics will need to provide
 563  563   * their own locking.
 564  564   */
 565  565  
 566  566  /*
 567  567   * Helper function to find the zsd_entry associated with the key in the
 568  568   * given list.
 569  569   */
 570  570  static struct zsd_entry *
 571  571  zsd_find(list_t *l, zone_key_t key)
 572  572  {
 573  573          struct zsd_entry *zsd;
 574  574  
 575  575          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 576  576                  if (zsd->zsd_key == key) {
 577  577                          return (zsd);
 578  578                  }
 579  579          }
 580  580          return (NULL);
 581  581  }
 582  582  
 583  583  /*
 584  584   * Helper function to find the zsd_entry associated with the key in the
 585  585   * given list and move it to the front.  Returns NULL if the key is absent.
 586  586   */
 587  587  static struct zsd_entry *
 588  588  zsd_find_mru(list_t *l, zone_key_t key)
 589  589  {
 590  590          struct zsd_entry *zsd;
 591  591  
 592  592          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 593  593                  if (zsd->zsd_key == key) {
 594  594                          /*
 595  595                           * Move to head of list to keep list in MRU order.
 596  596                           */
 597  597                          if (zsd != list_head(l)) {      /* not already first */
 598  598                                  list_remove(l, zsd);
 599  599                                  list_insert_head(l, zsd);
 600  600                          }
 601  601                          return (zsd);
 602  602                  }
 603  603          }
 604  604          return (NULL);
 605  605  }
 606  606  
 607  607  void
 608  608  zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 609  609      void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 610  610  {
 611  611          struct zsd_entry *zsdp;
 612  612          struct zsd_entry *t;
 613  613          struct zone *zone;
 614  614          zone_key_t  key;
 615  615  
 616  616          zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 617  617          zsdp->zsd_data = NULL;
 618  618          zsdp->zsd_create = create;
 619  619          zsdp->zsd_shutdown = shutdown;
 620  620          zsdp->zsd_destroy = destroy;
 621  621  
 622  622          /*
 623  623           * Assign the next key value and insert the template entry in the
 624  624           * global list of callbacks, so that future zone creations see it.
 625  625           */
 626  626          mutex_enter(&zsd_key_lock);
 627  627          key = zsdp->zsd_key = ++zsd_keyval;
 628  628          ASSERT(zsd_keyval != 0);
 629  629          list_insert_tail(&zsd_registered_keys, zsdp);
 630  630          mutex_exit(&zsd_key_lock);
 631  631  
 632  632          /*
 633  633           * Insert an entry into every existing zone and, if there is a
 634  634           * create callback, mark the entry as needing that callback.
 635  635           */
 636  636          mutex_enter(&zonehash_lock);    /* stop the world */
 637  637          for (zone = list_head(&zone_active); zone != NULL;
 638  638              zone = list_next(&zone_active, zone)) {
 639  639                  zone_status_t status;
 640  640  
 641  641                  mutex_enter(&zone->zone_lock);
 642  642  
 643  643                  /* Skip zones that are on the way down or not yet up */
 644  644                  status = zone_status_get(zone);
 645  645                  if (status >= ZONE_IS_DOWN ||
 646  646                      status == ZONE_IS_UNINITIALIZED) {
 647  647                          mutex_exit(&zone->zone_lock);
 648  648                          continue;
 649  649                  }
 650  650  
 651  651                  t = zsd_find_mru(&zone->zone_zsd, key);
 652  652                  if (t != NULL) {
 653  653                          /*
 654  654                           * zone_zsd_configure() already inserted it after
 655  655                           * we dropped zsd_key_lock above.
 656  656                           */
 657  657                          mutex_exit(&zone->zone_lock);
 658  658                          continue;
 659  659                  }
 660  660                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 661  661                  t->zsd_key = key;
 662  662                  t->zsd_create = create;
 663  663                  t->zsd_shutdown = shutdown;
 664  664                  t->zsd_destroy = destroy;
 665  665                  if (create != NULL) {
 666  666                          t->zsd_flags = ZSD_CREATE_NEEDED;
 667  667                          DTRACE_PROBE2(zsd__create__needed,
 668  668                              zone_t *, zone, zone_key_t, key);
 669  669                  }
 670  670                  list_insert_tail(&zone->zone_zsd, t);
 671  671                  mutex_exit(&zone->zone_lock);
 672  672          }
 673  673          mutex_exit(&zonehash_lock);
 674  674  
 675  675          if (create != NULL) {
 676  676                  /* Now run the pending create callbacks for this key */
 677  677                  zsd_apply_all_zones(zsd_apply_create, key);
 678  678          }
 679  679          /*
 680  680           * It is safe for consumers to use the key now, make it
 681  681           * globally visible. Specifically zone_getspecific() will
 682  682           * always successfully return the zone specific data associated
 683  683           * with the key.
 684  684           */
 685  685          *keyp = key;
 686  686  
 687  687  }
 688  688  
 689  689  /*
 690  690   * Function called when a module is being unloaded, or otherwise wishes
 691  691   * to unregister its ZSD key and callbacks. Returns -1 for an unknown key.
 692  692   *
 693  693   * Remove from the global list and determine the functions that need to
 694  694   * be called under a global lock. Then call the functions without
 695  695   * holding any locks. Finally free up the zone_zsd entries. (The apply
 696  696   * functions need to access the zone_zsd entries to find zsd_data etc.)
 697  697   */
 698  698  int
 699  699  zone_key_delete(zone_key_t key)
 700  700  {
 701  701          struct zsd_entry *zsdp = NULL;
 702  702          zone_t *zone;
 703  703  
 704  704          mutex_enter(&zsd_key_lock);
 705  705          zsdp = zsd_find_mru(&zsd_registered_keys, key);
 706  706          if (zsdp == NULL) {
 707  707                  mutex_exit(&zsd_key_lock);
 708  708                  return (-1);
 709  709          }
 710  710          list_remove(&zsd_registered_keys, zsdp);
 711  711          mutex_exit(&zsd_key_lock);
 712  712  
 713  713          mutex_enter(&zonehash_lock);
 714  714          for (zone = list_head(&zone_active); zone != NULL;
 715  715              zone = list_next(&zone_active, zone)) {
 716  716                  struct zsd_entry *del;
 717  717  
 718  718                  mutex_enter(&zone->zone_lock);
 719  719                  del = zsd_find_mru(&zone->zone_zsd, key);
 720  720                  if (del == NULL) {
 721  721                          /*
 722  722                           * Somebody else got here first, e.g. the zone going
 723  723                           * away.
 724  724                           */
 725  725                          mutex_exit(&zone->zone_lock);
 726  726                          continue;
 727  727                  }
 728  728                  ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 729  729                  ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 730  730                  if (del->zsd_shutdown != NULL &&
 731  731                      (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 732  732                          del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 733  733                          DTRACE_PROBE2(zsd__shutdown__needed,
 734  734                              zone_t *, zone, zone_key_t, key);
 735  735                  }
 736  736                  if (del->zsd_destroy != NULL &&
 737  737                      (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 738  738                          del->zsd_flags |= ZSD_DESTROY_NEEDED;
 739  739                          DTRACE_PROBE2(zsd__destroy__needed,
 740  740                              zone_t *, zone, zone_key_t, key);
 741  741                  }
 742  742                  mutex_exit(&zone->zone_lock);
 743  743          }
 744  744          mutex_exit(&zonehash_lock);
 745  745          kmem_free(zsdp, sizeof (*zsdp));
 746  746  
 747  747          /* Now call the shutdown and destroy callbacks for this key */
 748  748          zsd_apply_all_zones(zsd_apply_shutdown, key);
 749  749          zsd_apply_all_zones(zsd_apply_destroy, key);
 750  750  
 751  751          /* Now we can free up this key's zsd_entry in each zone */
 752  752          mutex_enter(&zonehash_lock);
 753  753          for (zone = list_head(&zone_active); zone != NULL;
 754  754              zone = list_next(&zone_active, zone)) {
 755  755                  struct zsd_entry *del;
 756  756  
 757  757                  mutex_enter(&zone->zone_lock);
 758  758                  del = zsd_find(&zone->zone_zsd, key);
 759  759                  if (del != NULL) {
 760  760                          list_remove(&zone->zone_zsd, del);
 761  761                          ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 762  762                          kmem_free(del, sizeof (*del));
 763  763                  }
 764  764                  mutex_exit(&zone->zone_lock);
 765  765          }
 766  766          mutex_exit(&zonehash_lock);
 767  767  
 768  768          return (0);
 769  769  }
 770  770  
 771  771  /*
 772  772   * ZSD counterpart of pthread_setspecific().
 773  773   *
 774  774   * Since all zsd callbacks, including those with no create function,
 775  775   * have an entry in zone_zsd, if the key is registered it is part of
 776  776   * the zone_zsd list.
 777  777   * Return 0 on success, or -1 if the key wasn't registered.
 778  778   */
 779  779  int
 780  780  zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 781  781  {
 782  782          struct zsd_entry *t;
 783  783  
 784  784          mutex_enter(&zone->zone_lock);
 785  785          t = zsd_find_mru(&zone->zone_zsd, key);
 786  786          if (t != NULL) {
 787  787                  /*
 788  788                   * Replace old value with new; the old value is not freed here
 789  789                   */
 790  790                  t->zsd_data = (void *)data;
 791  791                  mutex_exit(&zone->zone_lock);
 792  792                  return (0);
 793  793          }
 794  794          mutex_exit(&zone->zone_lock);
 795  795          return (-1);
 796  796  }
 797  797  
 798  798  /*
 799  799   * ZSD counterpart of pthread_getspecific(). Returns NULL for an unknown key.
 800  800   */
 801  801  void *
 802  802  zone_getspecific(zone_key_t key, zone_t *zone)
 803  803  {
 804  804          struct zsd_entry *t;
 805  805          void *data;
 806  806  
 807  807          mutex_enter(&zone->zone_lock);
 808  808          t = zsd_find_mru(&zone->zone_zsd, key);
 809  809          data = (t == NULL ? NULL : t->zsd_data);
 810  810          mutex_exit(&zone->zone_lock);
 811  811          return (data);
 812  812  }
 813  813  
 814  814  /*
 815  815   * Function used to initialize a zone's list of ZSD callbacks and data
 816  816   * when the zone is being created.  The callbacks are initialized from
 817  817   * the template list (zsd_registered_keys). The constructor callback is
 818  818   * executed later (once the zone exists and with locks dropped).
 819  819   */
 820  820  static void
 821  821  zone_zsd_configure(zone_t *zone)
 822  822  {
 823  823          struct zsd_entry *zsdp;
 824  824          struct zsd_entry *t;
 825  825  
 826  826          ASSERT(MUTEX_HELD(&zonehash_lock));
 827  827          ASSERT(list_head(&zone->zone_zsd) == NULL);
 828  828          mutex_enter(&zone->zone_lock);
 829  829          mutex_enter(&zsd_key_lock);
 830  830          for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 831  831              zsdp = list_next(&zsd_registered_keys, zsdp)) {
 832  832                  /*
 833  833                   * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
 834  834                   * should not have added anything to it.
 835  835                   */
 836  836                  ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 837  837  
 838  838                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 839  839                  t->zsd_key = zsdp->zsd_key;
 840  840                  t->zsd_create = zsdp->zsd_create;
 841  841                  t->zsd_shutdown = zsdp->zsd_shutdown;
 842  842                  t->zsd_destroy = zsdp->zsd_destroy;
 843  843                  if (zsdp->zsd_create != NULL) {
 844  844                          t->zsd_flags = ZSD_CREATE_NEEDED;
 845  845                          DTRACE_PROBE2(zsd__create__needed,
 846  846                              zone_t *, zone, zone_key_t, zsdp->zsd_key);
 847  847                  }
 848  848                  list_insert_tail(&zone->zone_zsd, t);
 849  849          }
 850  850          mutex_exit(&zsd_key_lock);
 851  851          mutex_exit(&zone->zone_lock);
 852  852  }
 853  853  
 854  854  enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 855  855  
 856  856  /*
 857  857   * Helper to run a zone's ZSD_SHUTDOWN or ZSD_DESTROY callbacks, per ct.
 858  858   */
 859  859  static void
 860  860  zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 861  861  {
 862  862          struct zsd_entry *t;
 863  863  
 864  864          ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 865  865          ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 866  866          ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 867  867  
 868  868          /*
 869  869           * Run the callback solely based on what is registered for the zone
 870  870           * in zone_zsd. The global list can change independently of this
 871  871           * as keys are registered and unregistered and we don't register new
 872  872           * callbacks for a zone that is in the process of going away.
 873  873           */
 874  874          mutex_enter(&zone->zone_lock);
 875  875          for (t = list_head(&zone->zone_zsd); t != NULL;
 876  876              t = list_next(&zone->zone_zsd, t)) {
 877  877                  zone_key_t key = t->zsd_key;
 878  878  
 879  879                  /* Flag only entries with a callback and no *_ALL bit set */
 880  880  
 881  881                  if (ct == ZSD_SHUTDOWN) {
 882  882                          if (t->zsd_shutdown != NULL &&
 883  883                              (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 884  884                                  t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 885  885                                  DTRACE_PROBE2(zsd__shutdown__needed,
 886  886                                      zone_t *, zone, zone_key_t, key);
 887  887                          }
 888  888                  } else {
 889  889                          if (t->zsd_destroy != NULL &&
 890  890                              (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 891  891                                  t->zsd_flags |= ZSD_DESTROY_NEEDED;
 892  892                                  DTRACE_PROBE2(zsd__destroy__needed,
 893  893                                      zone_t *, zone, zone_key_t, key);
 894  894                          }
 895  895                  }
 896  896          }
 897  897          mutex_exit(&zone->zone_lock);
 898  898  
 899  899          /* Now call the flagged shutdown and destroy callbacks for each key */
 900  900          zsd_apply_all_keys(zsd_apply_shutdown, zone);
 901  901          zsd_apply_all_keys(zsd_apply_destroy, zone);
 902  902  
 903  903  }
 904  904  
 905  905  /*
 906  906   * Called when the zone is going away; free ZSD-related memory, and
 907  907   * destroy the zone_zsd list.
 908  908   */
 909  909  static void
 910  910  zone_free_zsd(zone_t *zone)
 911  911  {
 912  912          struct zsd_entry *t, *next;
 913  913  
 914  914          /*
 915  915           * Free every zsd_entry on this zone; none may still be in progress.
 916  916           */
 917  917          mutex_enter(&zone->zone_lock);
 918  918          for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 919  919                  next = list_next(&zone->zone_zsd, t);
 920  920                  list_remove(&zone->zone_zsd, t);
 921  921                  ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 922  922                  kmem_free(t, sizeof (*t));
 923  923          }
 924  924          list_destroy(&zone->zone_zsd);
 925  925          mutex_exit(&zone->zone_lock);
 926  926  
 927  927  }
 928  928  
 929  929  /*
 930  930   * Apply a function to all zones for particular key value.
 931  931   *
 932  932   * The applyfn has to drop zonehash_lock if it does some work, and
 933  933   * then reacquire it before it returns, returning true to say it did so.
 934  934   * When the lock is dropped we don't follow list_next even
 935  935   * if it is possible to do so without any hazards. This is
 936  936   * because we want the design to allow for the list of zones
 937  937   * to change in any arbitrary way during the time the
 938  938   * lock was dropped.
 939  939   *
 940  940   * It is safe to restart the loop at list_head since the applyfn
 941  941   * changes the zsd_flags as it does work, so a subsequent
 942  942   * pass through will have no effect in applyfn, hence the loop will terminate
 943  943   * in at worst O(N^2).
 944  944   */
 945  945  static void
 946  946  zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 947  947  {
 948  948          zone_t *zone;
 949  949  
 950  950          mutex_enter(&zonehash_lock);
 951  951          zone = list_head(&zone_active);
 952  952          while (zone != NULL) {
 953  953                  if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 954  954                          /* Lock dropped - restart at head */
 955  955                          zone = list_head(&zone_active);
 956  956                  } else {
 957  957                          zone = list_next(&zone_active, zone);
 958  958                  }
 959  959          }
 960  960          mutex_exit(&zonehash_lock);
 961  961  }
 962  962  
 963  963  /*
 964  964   * Apply a function to all keys for a particular zone.
 965  965   *
 966  966   * The applyfn has to drop zone_lock if it does some work, and
 967  967   * then reacquire it before it returns, returning true to say it did so.
 968  968   * When the lock is dropped we don't follow list_next even
 969  969   * if it is possible to do so without any hazards. This is
 970  970   * because we want the design to allow for the list of zsd callbacks
 971  971   * to change in any arbitrary way during the time the
 972  972   * lock was dropped.
 973  973   *
 974  974   * It is safe to restart the loop at list_head since the applyfn
 975  975   * changes the zsd_flags as it does work, so a subsequent
 976  976   * pass through will have no effect in applyfn, hence the loop will terminate
 977  977   * in at worst O(N^2).
 978  978   */
 979  979  static void
 980  980  zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 981  981  {
 982  982          struct zsd_entry *t;
 983  983  
 984  984          mutex_enter(&zone->zone_lock);
 985  985          t = list_head(&zone->zone_zsd);
 986  986          while (t != NULL) {
 987  987                  if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 988  988                          /* Lock dropped - restart at head */
 989  989                          t = list_head(&zone->zone_zsd);
 990  990                  } else {
 991  991                          t = list_next(&zone->zone_zsd, t);
 992  992                  }
 993  993          }
 994  994          mutex_exit(&zone->zone_lock);
 995  995  }
 996  996  
 997  997  /*
 998  998   * Call the create function for the zone and key if CREATE_NEEDED
 999  999   * is set.
1000 1000   * If some other thread gets here first and sets CREATE_INPROGRESS, then
1001 1001   * we wait for that thread to complete so that we can ensure that
1002 1002   * all the callbacks are done when we've looped over all zones/keys.
1003 1003   *
1004 1004   * When we call the create function, we drop the global lock held by the
1005 1005   * caller, and return true to tell the caller it needs to re-evaluate the
1006 1006   * state.
1007 1007   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1008 1008   * remains held on exit.
1009 1009   */
1010 1010  static boolean_t
1011 1011  zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1012 1012      zone_t *zone, zone_key_t key)
1013 1013  {
1014 1014          void *result;
1015 1015          struct zsd_entry *t;
1016 1016          boolean_t dropped;
1017 1017  
1018 1018          if (lockp != NULL) {
1019 1019                  ASSERT(MUTEX_HELD(lockp));
1020 1020          }
1021 1021          if (zone_lock_held) {
1022 1022                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1023 1023          } else {
1024 1024                  mutex_enter(&zone->zone_lock);
1025 1025          }
1026 1026  
1027 1027          t = zsd_find(&zone->zone_zsd, key);
1028 1028          if (t == NULL) {
1029 1029                  /*
1030 1030                   * Somebody else got here first, e.g. the zone going
1031 1031                   * away.
1032 1032                   */
1033 1033                  if (!zone_lock_held)
1034 1034                          mutex_exit(&zone->zone_lock);
1035 1035                  return (B_FALSE);
1036 1036          }
1037 1037          dropped = B_FALSE;
1038 1038          if (zsd_wait_for_inprogress(zone, t, lockp))
1039 1039                  dropped = B_TRUE;
1040 1040  
1041 1041          if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1042 1042                  t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1043 1043                  t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1044 1044                  DTRACE_PROBE2(zsd__create__inprogress,
1045 1045                      zone_t *, zone, zone_key_t, key);
1046 1046                  mutex_exit(&zone->zone_lock);
1047 1047                  if (lockp != NULL)
1048 1048                          mutex_exit(lockp);
1049 1049  
1050 1050                  dropped = B_TRUE;
1051 1051                  ASSERT(t->zsd_create != NULL);
1052 1052                  DTRACE_PROBE2(zsd__create__start,
1053 1053                      zone_t *, zone, zone_key_t, key);
1054 1054  
1055 1055                  result = (*t->zsd_create)(zone->zone_id);
1056 1056  
1057 1057                  DTRACE_PROBE2(zsd__create__end,
1058 1058                      zone_t *, zone, void *, result);
1059 1059  
1060 1060                  ASSERT(result != NULL);
1061 1061                  if (lockp != NULL)
1062 1062                          mutex_enter(lockp);
1063 1063                  mutex_enter(&zone->zone_lock);
1064 1064                  t->zsd_data = result;
1065 1065                  t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1066 1066                  t->zsd_flags |= ZSD_CREATE_COMPLETED;
1067 1067                  cv_broadcast(&t->zsd_cv);
1068 1068                  DTRACE_PROBE2(zsd__create__completed,
1069 1069                      zone_t *, zone, zone_key_t, key);
1070 1070          }
1071 1071          if (!zone_lock_held)
1072 1072                  mutex_exit(&zone->zone_lock);
1073 1073          return (dropped);
1074 1074  }
1075 1075  
1076 1076  /*
1077 1077   * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1078 1078   * is set.
1079 1079   * If some other thread gets here first and sets *_INPROGRESS, then
1080 1080   * we wait for that thread to complete so that we can ensure that
1081 1081   * all the callbacks are done when we've looped over all zones/keys.
1082 1082   *
1083 1083   * When we call the shutdown function, we drop the global lock held by the
1084 1084   * caller, and return true to tell the caller it needs to re-evaluate the
1085 1085   * state.
1086 1086   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1087 1087   * remains held on exit.
1088 1088   */
1089 1089  static boolean_t
1090 1090  zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1091 1091      zone_t *zone, zone_key_t key)
1092 1092  {
1093 1093          struct zsd_entry *t;
1094 1094          void *data;
1095 1095          boolean_t dropped;
1096 1096  
1097 1097          if (lockp != NULL) {
1098 1098                  ASSERT(MUTEX_HELD(lockp));
1099 1099          }
1100 1100          if (zone_lock_held) {
1101 1101                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1102 1102          } else {
1103 1103                  mutex_enter(&zone->zone_lock);
1104 1104          }
1105 1105  
1106 1106          t = zsd_find(&zone->zone_zsd, key);
1107 1107          if (t == NULL) {
1108 1108                  /*
1109 1109                   * Somebody else got here first, e.g. the zone going
1110 1110                   * away.
1111 1111                   */
1112 1112                  if (!zone_lock_held)
1113 1113                          mutex_exit(&zone->zone_lock);
1114 1114                  return (B_FALSE);
1115 1115          }
1116 1116          dropped = B_FALSE;
1117 1117          if (zsd_wait_for_creator(zone, t, lockp))
1118 1118                  dropped = B_TRUE;
1119 1119  
1120 1120          if (zsd_wait_for_inprogress(zone, t, lockp))
1121 1121                  dropped = B_TRUE;
1122 1122  
1123 1123          if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1124 1124                  t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1125 1125                  t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1126 1126                  DTRACE_PROBE2(zsd__shutdown__inprogress,
1127 1127                      zone_t *, zone, zone_key_t, key);
1128 1128                  mutex_exit(&zone->zone_lock);
1129 1129                  if (lockp != NULL)
1130 1130                          mutex_exit(lockp);
1131 1131                  dropped = B_TRUE;
1132 1132  
1133 1133                  ASSERT(t->zsd_shutdown != NULL);
1134 1134                  data = t->zsd_data;
1135 1135  
1136 1136                  DTRACE_PROBE2(zsd__shutdown__start,
1137 1137                      zone_t *, zone, zone_key_t, key);
1138 1138  
1139 1139                  (t->zsd_shutdown)(zone->zone_id, data);
1140 1140                  DTRACE_PROBE2(zsd__shutdown__end,
1141 1141                      zone_t *, zone, zone_key_t, key);
1142 1142  
1143 1143                  if (lockp != NULL)
1144 1144                          mutex_enter(lockp);
1145 1145                  mutex_enter(&zone->zone_lock);
1146 1146                  t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1147 1147                  t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1148 1148                  cv_broadcast(&t->zsd_cv);
1149 1149                  DTRACE_PROBE2(zsd__shutdown__completed,
1150 1150                      zone_t *, zone, zone_key_t, key);
1151 1151          }
1152 1152          if (!zone_lock_held)
1153 1153                  mutex_exit(&zone->zone_lock);
1154 1154          return (dropped);
1155 1155  }
1156 1156  
1157 1157  /*
1158 1158   * Call the destroy function for the zone and key if DESTROY_NEEDED
1159 1159   * is set.
1160 1160   * If some other thread gets here first and sets *_INPROGRESS, then
1161 1161   * we wait for that thread to complete so that we can ensure that
1162 1162   * all the callbacks are done when we've looped over all zones/keys.
1163 1163   *
1164 1164   * When we call the destroy function, we drop the global lock held by the
1165 1165   * caller, and return true to tell the caller it needs to re-evaluate the
1166 1166   * state.
1167 1167   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1168 1168   * remains held on exit.
1169 1169   */
1170 1170  static boolean_t
1171 1171  zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1172 1172      zone_t *zone, zone_key_t key)
1173 1173  {
1174 1174          struct zsd_entry *t;
1175 1175          void *data;
1176 1176          boolean_t dropped;
1177 1177  
1178 1178          if (lockp != NULL) {
1179 1179                  ASSERT(MUTEX_HELD(lockp));
1180 1180          }
1181 1181          if (zone_lock_held) {
1182 1182                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1183 1183          } else {
1184 1184                  mutex_enter(&zone->zone_lock);
1185 1185          }
1186 1186  
1187 1187          t = zsd_find(&zone->zone_zsd, key);
1188 1188          if (t == NULL) {
1189 1189                  /*
1190 1190                   * Somebody else got here first, e.g. the zone going
1191 1191                   * away.
1192 1192                   */
1193 1193                  if (!zone_lock_held)
1194 1194                          mutex_exit(&zone->zone_lock);
1195 1195                  return (B_FALSE);
1196 1196          }
1197 1197          dropped = B_FALSE;
1198 1198          if (zsd_wait_for_creator(zone, t, lockp))
1199 1199                  dropped = B_TRUE;
1200 1200  
1201 1201          if (zsd_wait_for_inprogress(zone, t, lockp))
1202 1202                  dropped = B_TRUE;
1203 1203  
1204 1204          if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1205 1205                  t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1206 1206                  t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1207 1207                  DTRACE_PROBE2(zsd__destroy__inprogress,
1208 1208                      zone_t *, zone, zone_key_t, key);
1209 1209                  mutex_exit(&zone->zone_lock);
1210 1210                  if (lockp != NULL)
1211 1211                          mutex_exit(lockp);
1212 1212                  dropped = B_TRUE;
1213 1213  
1214 1214                  ASSERT(t->zsd_destroy != NULL);
1215 1215                  data = t->zsd_data;
1216 1216                  DTRACE_PROBE2(zsd__destroy__start,
1217 1217                      zone_t *, zone, zone_key_t, key);
1218 1218  
1219 1219                  (t->zsd_destroy)(zone->zone_id, data);
1220 1220                  DTRACE_PROBE2(zsd__destroy__end,
1221 1221                      zone_t *, zone, zone_key_t, key);
1222 1222  
1223 1223                  if (lockp != NULL)
1224 1224                          mutex_enter(lockp);
1225 1225                  mutex_enter(&zone->zone_lock);
1226 1226                  t->zsd_data = NULL;
1227 1227                  t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1228 1228                  t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1229 1229                  cv_broadcast(&t->zsd_cv);
1230 1230                  DTRACE_PROBE2(zsd__destroy__completed,
1231 1231                      zone_t *, zone, zone_key_t, key);
1232 1232          }
1233 1233          if (!zone_lock_held)
1234 1234                  mutex_exit(&zone->zone_lock);
1235 1235          return (dropped);
1236 1236  }
1237 1237  
1238 1238  /*
1239 1239   * Wait (on zsd_cv, under zone_lock) for any CREATE_NEEDED flag to be cleared.
1240 1240   * Returns true if lockp was temporarily dropped while waiting.
1241 1241   */
1242 1242  static boolean_t
1243 1243  zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1244 1244  {
1245 1245          boolean_t dropped = B_FALSE;
1246 1246  
1247 1247          while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1248 1248                  DTRACE_PROBE2(zsd__wait__for__creator,
1249 1249                      zone_t *, zone, struct zsd_entry *, t);
1250 1250                  if (lockp != NULL) {
1251 1251                          dropped = B_TRUE;
1252 1252                          mutex_exit(lockp);
1253 1253                  }
1254 1254                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1255 1255                  if (lockp != NULL) {
1256 1256                          /* First drop zone_lock to preserve lock order */
1257 1257                          mutex_exit(&zone->zone_lock);
1258 1258                          mutex_enter(lockp);
1259 1259                          mutex_enter(&zone->zone_lock);
1260 1260                  }
1261 1261          }
1262 1262          return (dropped);
1263 1263  }
1264 1264  
1265 1265  /*
1266 1266   * Wait (on zsd_cv, under zone_lock) for any INPROGRESS flag to be cleared.
1267 1267   * Returns true if lockp was temporarily dropped while waiting.
1268 1268   */
1269 1269  static boolean_t
1270 1270  zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1271 1271  {
1272 1272          boolean_t dropped = B_FALSE;
1273 1273  
1274 1274          while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1275 1275                  DTRACE_PROBE2(zsd__wait__for__inprogress,
1276 1276                      zone_t *, zone, struct zsd_entry *, t);
1277 1277                  if (lockp != NULL) {
1278 1278                          dropped = B_TRUE;
1279 1279                          mutex_exit(lockp);
1280 1280                  }
1281 1281                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1282 1282                  if (lockp != NULL) {
1283 1283                          /* First drop zone_lock to preserve lock order */
1284 1284                          mutex_exit(&zone->zone_lock);
1285 1285                          mutex_enter(lockp);
1286 1286                          mutex_enter(&zone->zone_lock);
1287 1287                  }
1288 1288          }
1289 1289          return (dropped);
1290 1290  }
1291 1291  
1292 1292  /*
1293 1293   * Frees every entry of the zone dataset list, then destroys the list.
1294 1294   */
1295 1295  static void
1296 1296  zone_free_datasets(zone_t *zone)
1297 1297  {
1298 1298          zone_dataset_t *t, *next;
1299 1299  
1300 1300          for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1301 1301                  next = list_next(&zone->zone_datasets, t);
1302 1302                  list_remove(&zone->zone_datasets, t);
1303 1303                  kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1); /* + NUL */
1304 1304                  kmem_free(t, sizeof (*t));
1305 1305          }
1306 1306          list_destroy(&zone->zone_datasets);
1307 1307  }
1308 1308  
1309 1309  /*
1310 1310   * zone.cpu-shares resource control support.
1311 1311   */
1312 1312  /*ARGSUSED*/
1313 1313  static rctl_qty_t
1314 1314  zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1315 1315  {
1316 1316          ASSERT(MUTEX_HELD(&p->p_lock));
1317 1317          return (p->p_zone->zone_shares);        /* zone_shares of p's zone */
1318 1318  }
1319 1319  
1320 1320  /*ARGSUSED*/
1321 1321  static int
1322 1322  zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1323 1323      rctl_qty_t nv)
1324 1324  {
1325 1325          ASSERT(MUTEX_HELD(&p->p_lock));
1326 1326          ASSERT(e->rcep_t == RCENTITY_ZONE);
1327 1327          if (e->rcep_p.zone == NULL)
1328 1328                  return (0);     /* no zone attached: nothing to update */
1329 1329  
1330 1330          e->rcep_p.zone->zone_shares = nv;
1331 1331          return (0);
1332 1332  }
1333 1333  
1334 1334  static rctl_ops_t zone_cpu_shares_ops = {
1335 1335          rcop_no_action,
1336 1336          zone_cpu_shares_usage,  /* presumably the usage hook - confirm */
1337 1337          zone_cpu_shares_set,    /* presumably the set hook - confirm */
1338 1338          rcop_no_test
1339 1339  };
1340 1340  
1341 1341  /*
1342 1342   * zone.cpu-cap resource control support.
1343 1343   */
1344 1344  /*ARGSUSED*/
1345 1345  static rctl_qty_t
1346 1346  zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1347 1347  {
1348 1348          ASSERT(MUTEX_HELD(&p->p_lock));
1349 1349          return (cpucaps_zone_get(p->p_zone));   /* queried for p's zone */
1350 1350  }
1351 1351  
           /*
            * Set callback (see zone_cpu_cap_ops): hands the new value nv to
            * cpucaps_zone_set() for the target zone and returns its result.
            * Returns 0 without doing anything when the entity carries no zone.
            * The caller must hold p->p_lock.
            */
1352 1352  /*ARGSUSED*/
1353 1353  static int
1354 1354  zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1355 1355      rctl_qty_t nv)
1356 1356  {
1357 1357          zone_t *zone = e->rcep_p.zone;
1358 1358  
1359 1359          ASSERT(MUTEX_HELD(&p->p_lock));
1360 1360          ASSERT(e->rcep_t == RCENTITY_ZONE);
1361 1361  
1362 1362          if (zone == NULL)
1363 1363                  return (0);
1364 1364  
1365 1365          /*
1366 1366           * set cap to the new value.
1367 1367           */
1368 1368          return (cpucaps_zone_set(zone, nv));
1369 1369  }
1370 1370  
           /*
            * Operations vector bundling the zone.cpu-cap handlers above; the
            * first and last slots use the generic rcop_no_action and
            * rcop_no_test routines.
            */
1371 1371  static rctl_ops_t zone_cpu_cap_ops = {
1372 1372          rcop_no_action,
1373 1373          zone_cpu_cap_get,
1374 1374          zone_cpu_cap_set,
1375 1375          rcop_no_test
1376 1376  };
1377 1377  
           /*
            * Usage callback (see zone_lwps_ops): returns a snapshot of the
            * zone_nlwps counter of p's zone, read under zone_nlwps_lock.  The
            * caller must hold p->p_lock.
            */
1378 1378  /*ARGSUSED*/
1379 1379  static rctl_qty_t
1380 1380  zone_lwps_usage(rctl_t *r, proc_t *p)
1381 1381  {
1382 1382          rctl_qty_t nlwps;
1383 1383          zone_t *zone = p->p_zone;
1384 1384  
1385 1385          ASSERT(MUTEX_HELD(&p->p_lock));
1386 1386  
1387 1387          mutex_enter(&zone->zone_nlwps_lock);
1388 1388          nlwps = zone->zone_nlwps;
1389 1389          mutex_exit(&zone->zone_nlwps_lock);
1390 1390  
1391 1391          return (nlwps);
1392 1392  }
1393 1393  
           /*
            * Test callback (see zone_lwps_ops): returns 1 when the zone's current
            * zone_nlwps plus incr would exceed rcntl->rcv_value, and 0 otherwise.
            * Returns 0 when the entity carries no zone.  The caller must hold
            * both p->p_lock and the zone's zone_nlwps_lock.
            */
1394 1394  /*ARGSUSED*/
1395 1395  static int
1396 1396  zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1397 1397      rctl_qty_t incr, uint_t flags)
1398 1398  {
1399 1399          rctl_qty_t nlwps;
1400 1400  
1401 1401          ASSERT(MUTEX_HELD(&p->p_lock));
1402 1402          ASSERT(e->rcep_t == RCENTITY_ZONE);
1403 1403          if (e->rcep_p.zone == NULL)
1404 1404                  return (0);
1405 1405          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1406 1406          nlwps = e->rcep_p.zone->zone_nlwps;
1407 1407  
1408 1408          if (nlwps + incr > rcntl->rcv_value)
1409 1409                  return (1);
1410 1410  
1411 1411          return (0);
1412 1412  }
1413 1413  
           /*
            * Set callback (see zone_lwps_ops): records nv as the zone's
            * zone_nlwps_ctl.  No-op when the entity carries no zone.  Always
            * returns 0.  The caller must hold p->p_lock.
            */
1414 1414  /*ARGSUSED*/
1415 1415  static int
1416 1416  zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1417 1417  {
1418 1418          ASSERT(MUTEX_HELD(&p->p_lock));
1419 1419          ASSERT(e->rcep_t == RCENTITY_ZONE);
1420 1420          if (e->rcep_p.zone == NULL)
1421 1421                  return (0);
1422 1422          e->rcep_p.zone->zone_nlwps_ctl = nv;
1423 1423          return (0);
1424 1424  }
1425 1425  
           /*
            * Operations vector bundling the zone_lwps_* handlers above; only the
            * first slot uses a generic routine (rcop_no_action).
            */
1426 1426  static rctl_ops_t zone_lwps_ops = {
1427 1427          rcop_no_action,
1428 1428          zone_lwps_usage,
1429 1429          zone_lwps_set,
1430 1430          zone_lwps_test,
1431 1431  };
1432 1432  
           /*
            * Usage callback (see zone_procs_ops): returns a snapshot of the
            * zone_nprocs counter of p's zone.  Note that the counter is read
            * under zone_nlwps_lock, the same lock used for zone_nlwps.  The
            * caller must hold p->p_lock.
            */
1433 1433  /*ARGSUSED*/
1434 1434  static rctl_qty_t
1435 1435  zone_procs_usage(rctl_t *r, proc_t *p)
1436 1436  {
1437 1437          rctl_qty_t nprocs;
1438 1438          zone_t *zone = p->p_zone;
1439 1439  
1440 1440          ASSERT(MUTEX_HELD(&p->p_lock));
1441 1441  
1442 1442          mutex_enter(&zone->zone_nlwps_lock);
1443 1443          nprocs = zone->zone_nprocs;
1444 1444          mutex_exit(&zone->zone_nlwps_lock);
1445 1445  
1446 1446          return (nprocs);
1447 1447  }
1448 1448  
           /*
            * Test callback (see zone_procs_ops): returns 1 when the zone's
            * current zone_nprocs plus incr would exceed rcntl->rcv_value, and 0
            * otherwise.  Returns 0 when the entity carries no zone.  The caller
            * must hold both p->p_lock and the zone's zone_nlwps_lock.
            */
1449 1449  /*ARGSUSED*/
1450 1450  static int
1451 1451  zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1452 1452      rctl_qty_t incr, uint_t flags)
1453 1453  {
1454 1454          rctl_qty_t nprocs;
1455 1455  
1456 1456          ASSERT(MUTEX_HELD(&p->p_lock));
1457 1457          ASSERT(e->rcep_t == RCENTITY_ZONE);
1458 1458          if (e->rcep_p.zone == NULL)
1459 1459                  return (0);
1460 1460          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1461 1461          nprocs = e->rcep_p.zone->zone_nprocs;
1462 1462  
1463 1463          if (nprocs + incr > rcntl->rcv_value)
1464 1464                  return (1);
1465 1465  
1466 1466          return (0);
1467 1467  }
1468 1468  
           /*
            * Set callback (see zone_procs_ops): records nv as the zone's
            * zone_nprocs_ctl.  No-op when the entity carries no zone.  Always
            * returns 0.  The caller must hold p->p_lock.
            */
1469 1469  /*ARGSUSED*/
1470 1470  static int
1471 1471  zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1472 1472  {
1473 1473          ASSERT(MUTEX_HELD(&p->p_lock));
1474 1474          ASSERT(e->rcep_t == RCENTITY_ZONE);
1475 1475          if (e->rcep_p.zone == NULL)
1476 1476                  return (0);
1477 1477          e->rcep_p.zone->zone_nprocs_ctl = nv;
1478 1478          return (0);
1479 1479  }
1480 1480  
           /*
            * Operations vector bundling the zone_procs_* handlers above; only the
            * first slot uses a generic routine (rcop_no_action).
            */
1481 1481  static rctl_ops_t zone_procs_ops = {
1482 1482          rcop_no_action,
1483 1483          zone_procs_usage,
1484 1484          zone_procs_set,
1485 1485          zone_procs_test,
1486 1486  };
1487 1487  
           /*
            * Usage callback (see zone_shmmax_ops): returns the zone_shmmax value
            * of the zone that p belongs to.  The caller must hold p->p_lock.
            */
1488 1488  /*ARGSUSED*/
1489 1489  static rctl_qty_t
1490 1490  zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1491 1491  {
1492 1492          ASSERT(MUTEX_HELD(&p->p_lock));
1493 1493          return (p->p_zone->zone_shmmax);
1494 1494  }
1495 1495  
           /*
            * Test callback (see zone_shmmax_ops): returns 1 when the zone's
            * zone_shmmax plus incr would exceed rval->rcv_value, and 0 otherwise.
            * The caller must hold p->p_lock.
            *
            * NOTE(review): unlike zone_lwps_test(), e->rcep_p.zone is
            * dereferenced without a NULL check; confirm callers never pass an
            * entity without a zone.
            */
1496 1496  /*ARGSUSED*/
1497 1497  static int
1498 1498  zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1499 1499      rctl_qty_t incr, uint_t flags)
1500 1500  {
1501 1501          rctl_qty_t v;
1502 1502          ASSERT(MUTEX_HELD(&p->p_lock));
1503 1503          ASSERT(e->rcep_t == RCENTITY_ZONE);
1504 1504          v = e->rcep_p.zone->zone_shmmax + incr;
1505 1505          if (v > rval->rcv_value)
1506 1506                  return (1);
1507 1507          return (0);
1508 1508  }
1509 1509  
           /*
            * Operations vector bundling the zone_shmmax_* handlers above.  There
            * is no file-local set handler; the action and set slots use the
            * generic rcop_no_action and rcop_no_set routines.
            */
1510 1510  static rctl_ops_t zone_shmmax_ops = {
1511 1511          rcop_no_action,
1512 1512          zone_shmmax_usage,
1513 1513          rcop_no_set,
1514 1514          zone_shmmax_test
1515 1515  };
1516 1516  
           /*
            * Usage callback (see zone_shmmni_ops): returns zone_ipc.ipcq_shmmni of
            * the zone that p belongs to.  The caller must hold p->p_lock.
            */
1517 1517  /*ARGSUSED*/
1518 1518  static rctl_qty_t
1519 1519  zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1520 1520  {
1521 1521          ASSERT(MUTEX_HELD(&p->p_lock));
1522 1522          return (p->p_zone->zone_ipc.ipcq_shmmni);
1523 1523  }
1524 1524  
           /*
            * Test callback (see zone_shmmni_ops): returns 1 when the zone's
            * zone_ipc.ipcq_shmmni plus incr would exceed rval->rcv_value, and 0
            * otherwise.  The caller must hold p->p_lock.
            *
            * NOTE(review): e->rcep_p.zone is dereferenced without a NULL check;
            * confirm callers never pass an entity without a zone.
            */
1525 1525  /*ARGSUSED*/
1526 1526  static int
1527 1527  zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1528 1528      rctl_qty_t incr, uint_t flags)
1529 1529  {
1530 1530          rctl_qty_t v;
1531 1531          ASSERT(MUTEX_HELD(&p->p_lock));
1532 1532          ASSERT(e->rcep_t == RCENTITY_ZONE);
1533 1533          v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1534 1534          if (v > rval->rcv_value)
1535 1535                  return (1);
1536 1536          return (0);
1537 1537  }
1538 1538  
           /*
            * Operations vector bundling the zone_shmmni_* handlers above.  There
            * is no file-local set handler; the action and set slots use the
            * generic rcop_no_action and rcop_no_set routines.
            */
1539 1539  static rctl_ops_t zone_shmmni_ops = {
1540 1540          rcop_no_action,
1541 1541          zone_shmmni_usage,
1542 1542          rcop_no_set,
1543 1543          zone_shmmni_test
1544 1544  };
1545 1545  
           /*
            * Usage callback (see zone_semmni_ops): returns zone_ipc.ipcq_semmni of
            * the zone that p belongs to.  The caller must hold p->p_lock.
            */
1546 1546  /*ARGSUSED*/
1547 1547  static rctl_qty_t
1548 1548  zone_semmni_usage(rctl_t *rctl, struct proc *p)
1549 1549  {
1550 1550          ASSERT(MUTEX_HELD(&p->p_lock));
1551 1551          return (p->p_zone->zone_ipc.ipcq_semmni);
1552 1552  }
1553 1553  
           /*
            * Test callback (see zone_semmni_ops): returns 1 when the zone's
            * zone_ipc.ipcq_semmni plus incr would exceed rval->rcv_value, and 0
            * otherwise.  The caller must hold p->p_lock.
            *
            * NOTE(review): e->rcep_p.zone is dereferenced without a NULL check;
            * confirm callers never pass an entity without a zone.
            */
1554 1554  /*ARGSUSED*/
1555 1555  static int
1556 1556  zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1557 1557      rctl_qty_t incr, uint_t flags)
1558 1558  {
1559 1559          rctl_qty_t v;
1560 1560          ASSERT(MUTEX_HELD(&p->p_lock));
1561 1561          ASSERT(e->rcep_t == RCENTITY_ZONE);
1562 1562          v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1563 1563          if (v > rval->rcv_value)
1564 1564                  return (1);
1565 1565          return (0);
1566 1566  }
1567 1567  
           /*
            * Operations vector bundling the zone_semmni_* handlers above.  There
            * is no file-local set handler; the action and set slots use the
            * generic rcop_no_action and rcop_no_set routines.
            */
1568 1568  static rctl_ops_t zone_semmni_ops = {
1569 1569          rcop_no_action,
1570 1570          zone_semmni_usage,
1571 1571          rcop_no_set,
1572 1572          zone_semmni_test
1573 1573  };
1574 1574  
           /*
            * Usage callback (see zone_msgmni_ops): returns zone_ipc.ipcq_msgmni of
            * the zone that p belongs to.  The caller must hold p->p_lock.
            */
1575 1575  /*ARGSUSED*/
1576 1576  static rctl_qty_t
1577 1577  zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1578 1578  {
1579 1579          ASSERT(MUTEX_HELD(&p->p_lock));
1580 1580          return (p->p_zone->zone_ipc.ipcq_msgmni);
1581 1581  }
1582 1582  
           /*
            * Test callback (see zone_msgmni_ops): returns 1 when the zone's
            * zone_ipc.ipcq_msgmni plus incr would exceed rval->rcv_value, and 0
            * otherwise.  The caller must hold p->p_lock.
            *
            * NOTE(review): e->rcep_p.zone is dereferenced without a NULL check;
            * confirm callers never pass an entity without a zone.
            */
1583 1583  /*ARGSUSED*/
1584 1584  static int
1585 1585  zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1586 1586      rctl_qty_t incr, uint_t flags)
1587 1587  {
1588 1588          rctl_qty_t v;
1589 1589          ASSERT(MUTEX_HELD(&p->p_lock));
1590 1590          ASSERT(e->rcep_t == RCENTITY_ZONE);
1591 1591          v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1592 1592          if (v > rval->rcv_value)
1593 1593                  return (1);
1594 1594          return (0);
1595 1595  }
1596 1596  
           /*
            * Operations vector bundling the zone_msgmni_* handlers above.  There
            * is no file-local set handler; the action and set slots use the
            * generic rcop_no_action and rcop_no_set routines.
            */
1597 1597  static rctl_ops_t zone_msgmni_ops = {
1598 1598          rcop_no_action,
1599 1599          zone_msgmni_usage,
1600 1600          rcop_no_set,
1601 1601          zone_msgmni_test
1602 1602  };
1603 1603  
           /*
            * Usage callback (see zone_locked_mem_ops): returns a snapshot of
            * zone_locked_mem for p's zone, read under zone_mem_lock.  The caller
            * must hold p->p_lock.
            */
1604 1604  /*ARGSUSED*/
1605 1605  static rctl_qty_t
1606 1606  zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1607 1607  {
1608 1608          rctl_qty_t q;
1609 1609          ASSERT(MUTEX_HELD(&p->p_lock));
1610 1610          mutex_enter(&p->p_zone->zone_mem_lock);
1611 1611          q = p->p_zone->zone_locked_mem;
1612 1612          mutex_exit(&p->p_zone->zone_mem_lock);
1613 1613          return (q);
1614 1614  }
1615 1615  
           /*
            * Test callback (see zone_locked_mem_ops): returns 1 when the zone's
            * zone_locked_mem plus incr would exceed rcntl->rcv_value, and 0
            * otherwise.  The caller must hold both p->p_lock and the zone's
            * zone_mem_lock.
            *
            * NOTE(review): unlike zone_locked_mem_set(), this neither asserts
            * RCENTITY_ZONE nor checks e->rcep_p.zone for NULL before using it;
            * confirm callers always supply a zone.
            */
1616 1616  /*ARGSUSED*/
1617 1617  static int
1618 1618  zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1619 1619      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1620 1620  {
1621 1621          rctl_qty_t q;
1622 1622          zone_t *z;
1623 1623  
1624 1624          z = e->rcep_p.zone;
1625 1625          ASSERT(MUTEX_HELD(&p->p_lock));
1626 1626          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1627 1627          q = z->zone_locked_mem;
1628 1628          if (q + incr > rcntl->rcv_value)
1629 1629                  return (1);
1630 1630          return (0);
1631 1631  }
1632 1632  
           /*
            * Set callback (see zone_locked_mem_ops): records nv as the zone's
            * zone_locked_mem_ctl.  No-op when the entity carries no zone.  Always
            * returns 0.  The caller must hold p->p_lock.
            */
1633 1633  /*ARGSUSED*/
1634 1634  static int
1635 1635  zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1636 1636      rctl_qty_t nv)
1637 1637  {
1638 1638          ASSERT(MUTEX_HELD(&p->p_lock));
1639 1639          ASSERT(e->rcep_t == RCENTITY_ZONE);
1640 1640          if (e->rcep_p.zone == NULL)
1641 1641                  return (0);
1642 1642          e->rcep_p.zone->zone_locked_mem_ctl = nv;
1643 1643          return (0);
1644 1644  }
1645 1645  
           /*
            * Operations vector bundling the zone_locked_mem_* handlers above;
            * only the first slot uses a generic routine (rcop_no_action).
            */
1646 1646  static rctl_ops_t zone_locked_mem_ops = {
1647 1647          rcop_no_action,
1648 1648          zone_locked_mem_usage,
1649 1649          zone_locked_mem_set,
1650 1650          zone_locked_mem_test
1651 1651  };
1652 1652  
           /*
            * Usage callback (see zone_max_swap_ops): returns a snapshot of
            * zone_max_swap for p's zone, read under zone_mem_lock.  The caller
            * must hold p->p_lock.
            */
1653 1653  /*ARGSUSED*/
1654 1654  static rctl_qty_t
1655 1655  zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1656 1656  {
1657 1657          rctl_qty_t q;
1658 1658          zone_t *z = p->p_zone;
1659 1659  
1660 1660          ASSERT(MUTEX_HELD(&p->p_lock));
1661 1661          mutex_enter(&z->zone_mem_lock);
1662 1662          q = z->zone_max_swap;
1663 1663          mutex_exit(&z->zone_mem_lock);
1664 1664          return (q);
1665 1665  }
1666 1666  
           /*
            * Test callback (see zone_max_swap_ops): returns 1 when the zone's
            * zone_max_swap plus incr would exceed rcntl->rcv_value, and 0
            * otherwise.  The caller must hold both p->p_lock and the zone's
            * zone_mem_lock.
            *
            * NOTE(review): unlike zone_max_swap_set(), this neither asserts
            * RCENTITY_ZONE nor checks e->rcep_p.zone for NULL before using it;
            * confirm callers always supply a zone.
            */
1667 1667  /*ARGSUSED*/
1668 1668  static int
1669 1669  zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1670 1670      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1671 1671  {
1672 1672          rctl_qty_t q;
1673 1673          zone_t *z;
1674 1674  
1675 1675          z = e->rcep_p.zone;
1676 1676          ASSERT(MUTEX_HELD(&p->p_lock));
1677 1677          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1678 1678          q = z->zone_max_swap;
1679 1679          if (q + incr > rcntl->rcv_value)
1680 1680                  return (1);
1681 1681          return (0);
1682 1682  }
1683 1683  
           /*
            * Set callback (see zone_max_swap_ops): records nv as the zone's
            * zone_max_swap_ctl.  No-op when the entity carries no zone.  Always
            * returns 0.  The caller must hold p->p_lock.
            */
1684 1684  /*ARGSUSED*/
1685 1685  static int
1686 1686  zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1687 1687      rctl_qty_t nv)
1688 1688  {
1689 1689          ASSERT(MUTEX_HELD(&p->p_lock));
1690 1690          ASSERT(e->rcep_t == RCENTITY_ZONE);
1691 1691          if (e->rcep_p.zone == NULL)
1692 1692                  return (0);
1693 1693          e->rcep_p.zone->zone_max_swap_ctl = nv;
1694 1694          return (0);
1695 1695  }
1696 1696  
           /*
            * Operations vector bundling the zone_max_swap_* handlers above; only
            * the first slot uses a generic routine (rcop_no_action).
            */
1697 1697  static rctl_ops_t zone_max_swap_ops = {
1698 1698          rcop_no_action,
1699 1699          zone_max_swap_usage,
1700 1700          zone_max_swap_set,
1701 1701          zone_max_swap_test
1702 1702  };
1703 1703  
           /*
            * Usage callback (see zone_max_lofi_ops): returns a snapshot of
            * zone_max_lofi for p's zone, read under zone_rctl_lock.  The caller
            * must hold p->p_lock.
            */
1704 1704  /*ARGSUSED*/
1705 1705  static rctl_qty_t
1706 1706  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1707 1707  {
1708 1708          rctl_qty_t q;
1709 1709          zone_t *z = p->p_zone;
1710 1710  
1711 1711          ASSERT(MUTEX_HELD(&p->p_lock));
1712 1712          mutex_enter(&z->zone_rctl_lock);
1713 1713          q = z->zone_max_lofi;
1714 1714          mutex_exit(&z->zone_rctl_lock);
1715 1715          return (q);
1716 1716  }
1717 1717  
           /*
            * Test callback (see zone_max_lofi_ops): returns 1 when the zone's
            * zone_max_lofi plus incr would exceed rcntl->rcv_value, and 0
            * otherwise.  The caller must hold both p->p_lock and the zone's
            * zone_rctl_lock.
            *
            * NOTE(review): unlike zone_max_lofi_set(), this neither asserts
            * RCENTITY_ZONE nor checks e->rcep_p.zone for NULL before using it;
            * confirm callers always supply a zone.
            */
1718 1718  /*ARGSUSED*/
1719 1719  static int
1720 1720  zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1721 1721      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1722 1722  {
1723 1723          rctl_qty_t q;
1724 1724          zone_t *z;
1725 1725  
1726 1726          z = e->rcep_p.zone;
1727 1727          ASSERT(MUTEX_HELD(&p->p_lock));
1728 1728          ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1729 1729          q = z->zone_max_lofi;
1730 1730          if (q + incr > rcntl->rcv_value)
1731 1731                  return (1);
1732 1732          return (0);
1733 1733  }
1734 1734  
           /*
            * Set callback (see zone_max_lofi_ops): records nv as the zone's
            * zone_max_lofi_ctl.  No-op when the entity carries no zone.  Always
            * returns 0.  The caller must hold p->p_lock.
            */
1735 1735  /*ARGSUSED*/
1736 1736  static int
1737 1737  zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1738 1738      rctl_qty_t nv)
1739 1739  {
1740 1740          ASSERT(MUTEX_HELD(&p->p_lock));
1741 1741          ASSERT(e->rcep_t == RCENTITY_ZONE);
1742 1742          if (e->rcep_p.zone == NULL)
1743 1743                  return (0);
1744 1744          e->rcep_p.zone->zone_max_lofi_ctl = nv;
1745 1745          return (0);
1746 1746  }
1747 1747  
           /*
            * Operations vector bundling the zone_max_lofi_* handlers above; only
            * the first slot uses a generic routine (rcop_no_action).
            */
1748 1748  static rctl_ops_t zone_max_lofi_ops = {
1749 1749          rcop_no_action,
1750 1750          zone_max_lofi_usage,
1751 1751          zone_max_lofi_set,
1752 1752          zone_max_lofi_test
1753 1753  };
1754 1754  
1755 1755  /*
1756 1756   * Helper function to brand the zone with a unique ID.
            *
            * IDs come from a function-local static counter that starts at 0 and is
            * post-incremented on every call, so the first zone branded receives 0.
            * The caller must hold zonehash_lock, which serializes access to the
            * counter.
1757 1757   */
1758 1758  static void
1759 1759  zone_uniqid(zone_t *zone)
1760 1760  {
1761 1761          static uint64_t uniqid = 0;
1762 1762  
1763 1763          ASSERT(MUTEX_HELD(&zonehash_lock));
1764 1764          zone->zone_uniqid = uniqid++;
1765 1765  }
1766 1766  
1767 1767  /*
1768 1768   * Returns a held pointer to the "kcred" for the specified zone.
            *
            * Returns NULL when zone_find_by_id() finds no zone for zoneid.  The
            * cred is held with crhold() before the zone reference obtained by the
            * lookup is dropped; releasing the cred hold is presumably up to the
            * caller.
1769 1769   */
1770 1770  struct cred *
1771 1771  zone_get_kcred(zoneid_t zoneid)
1772 1772  {
1773 1773          zone_t *zone;
1774 1774          cred_t *cr;
1775 1775  
1776 1776          if ((zone = zone_find_by_id(zoneid)) == NULL)
1777 1777                  return (NULL);
1778 1778          cr = zone->zone_kcred;
1779 1779          crhold(cr);
1780 1780          zone_rele(zone);
1781 1781          return (cr);
1782 1782  }
1783 1783  
           /*
            * kstat update routine for the "lockedmem" kstat (see
            * zone_kstat_create()).  Copies the zone's zone_locked_mem into the
            * "usage" field and zone_locked_mem_ctl into the "value" field.
            * Writes are rejected with EACCES; reads return 0.
            *
            * NOTE(review): zone_locked_mem is read here without zone_mem_lock,
            * whereas zone_locked_mem_usage() takes that lock; confirm an unlocked
            * read is acceptable for statistics.
            */
1784 1784  static int
1785 1785  zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1786 1786  {
1787 1787          zone_t *zone = ksp->ks_private;
1788 1788          zone_kstat_t *zk = ksp->ks_data;
1789 1789  
1790 1790          if (rw == KSTAT_WRITE)
1791 1791                  return (EACCES);
1792 1792  
1793 1793          zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1794 1794          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1795 1795          return (0);
1796 1796  }
1797 1797  
           /*
            * kstat update routine for the "nprocs" kstat (see
            * zone_kstat_create()).  Copies the zone's zone_nprocs into the "usage"
            * field and zone_nprocs_ctl into the "value" field.  Writes are
            * rejected with EACCES; reads return 0.
            *
            * NOTE(review): zone_nprocs is read here without zone_nlwps_lock,
            * whereas zone_procs_usage() takes that lock; confirm an unlocked read
            * is acceptable for statistics.
            */
1798 1798  static int
1799 1799  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1800 1800  {
1801 1801          zone_t *zone = ksp->ks_private;
1802 1802          zone_kstat_t *zk = ksp->ks_data;
1803 1803  
1804 1804          if (rw == KSTAT_WRITE)
1805 1805                  return (EACCES);
1806 1806  
1807 1807          zk->zk_usage.value.ui64 = zone->zone_nprocs;
1808 1808          zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1809 1809          return (0);
1810 1810  }
1811 1811  
           /*
            * kstat update routine for the "swapresv" kstat (see
            * zone_kstat_create()).  Copies the zone's zone_max_swap into the
            * "usage" field and zone_max_swap_ctl into the "value" field.  Writes
            * are rejected with EACCES; reads return 0.
            *
            * NOTE(review): zone_max_swap is read here without zone_mem_lock,
            * whereas zone_max_swap_usage() takes that lock; confirm an unlocked
            * read is acceptable for statistics.
            */
1812 1812  static int
1813 1813  zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1814 1814  {
1815 1815          zone_t *zone = ksp->ks_private;
1816 1816          zone_kstat_t *zk = ksp->ks_data;
1817 1817  
1818 1818          if (rw == KSTAT_WRITE)
1819 1819                  return (EACCES);
1820 1820  
1821 1821          zk->zk_usage.value.ui64 = zone->zone_max_swap;
1822 1822          zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1823 1823          return (0);
1824 1824  }
1825 1825  
           /*
            * Creates and installs a virtual named kstat for zone, laid out as a
            * zone_kstat_t with "zonename", "usage" and "value" fields.
            *
            *   zone        zone the kstat describes; stored in ks_private
            *   name        kstat name handed to rctl_kstat_create_zone()
            *   updatefunc  installed as the kstat's ks_update routine
            *
            * Returns the installed kstat, or NULL when rctl_kstat_create_zone()
            * fails.  The zone_kstat_t data buffer is allocated here; see
            * zone_kstat_delete_common() for the matching free.
            */
1826 1826  static kstat_t *
1827 1827  zone_kstat_create_common(zone_t *zone, char *name,
1828 1828      int (*updatefunc) (kstat_t *, int))
1829 1829  {
1830 1830          kstat_t *ksp;
1831 1831          zone_kstat_t *zk;
1832 1832  
1833 1833          ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1834 1834              sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1835 1835              KSTAT_FLAG_VIRTUAL);
1836 1836  
1837 1837          if (ksp == NULL)
1838 1838                  return (NULL);
1839 1839  
                   /*
                    * The buffer is not zeroed, but each of its three named fields
                    * is initialized below.
                    */
1840 1840          zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
                   /* Grow the data size by the zone name plus its NUL. */
1841 1841          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1842 1842          kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1843 1843          kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1844 1844          kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1845 1845          kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1846 1846          ksp->ks_update = updatefunc;
1847 1847          ksp->ks_private = zone;
1848 1848          kstat_install(ksp);
1849 1849          return (ksp);
1850 1850  }
1851 1851  
1852 1852  
           /*
            * kstat update routine for the "memory_cap" kstat (see
            * zone_mcap_kstat_create()).  Copies the zone's pgpgin, anonpgin,
            * execpgin, fspgin and anon_alloc_fail counters into the kstat data.
            * Writes are rejected with EACCES; reads return 0.
            */
1853 1853  static int
1854 1854  zone_mcap_kstat_update(kstat_t *ksp, int rw)
1855 1855  {
1856 1856          zone_t *zone = ksp->ks_private;
1857 1857          zone_mcap_kstat_t *zmp = ksp->ks_data;
1858 1858  
1859 1859          if (rw == KSTAT_WRITE)
1860 1860                  return (EACCES);
1861 1861  
1862 1862          zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1863 1863          zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1864 1864          zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1865 1865          zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1866 1866          zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1867 1867  
1868 1868          return (0);
1869 1869  }
1870 1870  
           /*
            * Creates and installs the per-zone "memory_cap" named kstat.
            *
            * On success the zeroed zone_mcap_kstat_t data buffer is also recorded
            * in zone->zone_mcap_stats, zone_mcap_lock becomes the kstat's lock,
            * and the installed kstat is returned.  Returns NULL, leaving
            * zone_mcap_stats untouched, when kstat_create_zone() fails.
            */
1871 1871  static kstat_t *
1872 1872  zone_mcap_kstat_create(zone_t *zone)
1873 1873  {
1874 1874          kstat_t *ksp;
1875 1875          zone_mcap_kstat_t *zmp;
1876 1876  
1877 1877          if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1878 1878              zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1879 1879              sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1880 1880              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1881 1881                  return (NULL);
1882 1882  
                   /* For a non-global zone, also add the kstat to the global zone. */
1883 1883          if (zone->zone_id != GLOBAL_ZONEID)
1884 1884                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1885 1885  
1886 1886          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
                   /* Grow the data size by the zone name plus its NUL. */
1887 1887          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1888 1888          ksp->ks_lock = &zone->zone_mcap_lock;
1889 1889          zone->zone_mcap_stats = zmp;
1890 1890  
1891 1891          /* The kstat "name" field is not large enough for a full zonename */
1892 1892          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1893 1893          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1894 1894          kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1895 1895          kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1896 1896          kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1897 1897          kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1898 1898          kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1899 1899              KSTAT_DATA_UINT64);
1900 1900  
1901 1901          ksp->ks_update = zone_mcap_kstat_update;
1902 1902          ksp->ks_private = zone;
1903 1903  
1904 1904          kstat_install(ksp);
1905 1905          return (ksp);
1906 1906  }
1907 1907  
           /*
            * kstat update routine for the "zone_misc" kstat (see
            * zone_misc_kstat_create()).  Refreshes the time, load-average,
            * fork-failure, nested-interpreter, init pid and boot-time fields from
            * the zone.  Writes are rejected with EACCES; reads return 0.
            */
1908 1908  static int
1909 1909  zone_misc_kstat_update(kstat_t *ksp, int rw)
1910 1910  {
1911 1911          zone_t *zone = ksp->ks_private;
1912 1912          zone_misc_kstat_t *zmp = ksp->ks_data;
1913 1913          hrtime_t tmp;
1914 1914  
1915 1915          if (rw == KSTAT_WRITE)
1916 1916                  return (EACCES);
1917 1917  
                   /*
                    * Each time value is copied into a local and passed through
                    * scalehrtime() before being published, so the zone's own
                    * counters are left unmodified.  The fields are exported as
                    * "nsec_*", so scalehrtime() presumably converts to
                    * nanoseconds -- confirm.
                    */
1918 1918          tmp = zone->zone_utime;
1919 1919          scalehrtime(&tmp);
1920 1920          zmp->zm_utime.value.ui64 = tmp;
1921 1921          tmp = zone->zone_stime;
1922 1922          scalehrtime(&tmp);
1923 1923          zmp->zm_stime.value.ui64 = tmp;
1924 1924          tmp = zone->zone_wtime;
1925 1925          scalehrtime(&tmp);
1926 1926          zmp->zm_wtime.value.ui64 = tmp;
1927 1927  
1928 1928          zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1929 1929          zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1930 1930          zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1931 1931  
1932 1932          zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1933 1933          zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1934 1934          zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1935 1935          zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1936 1936  
1937 1937          zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1938 1938  
1939 1939          zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1940 1940          zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1941 1941  
1942 1942          return (0);
1943 1943  }
1944 1944  
           /*
            * Creates and installs the per-zone "zone_misc" named kstat (module
            * "zones").
            *
            * On success the zeroed zone_misc_kstat_t data buffer is also recorded
            * in zone->zone_misc_stats, zone_misc_lock becomes the kstat's lock,
            * and the installed kstat is returned.  Returns NULL, leaving
            * zone_misc_stats untouched, when kstat_create_zone() fails.
            */
1945 1945  static kstat_t *
1946 1946  zone_misc_kstat_create(zone_t *zone)
1947 1947  {
1948 1948          kstat_t *ksp;
1949 1949          zone_misc_kstat_t *zmp;
1950 1950  
1951 1951          if ((ksp = kstat_create_zone("zones", zone->zone_id,
1952 1952              zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1953 1953              sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1954 1954              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1955 1955                  return (NULL);
1956 1956  
                   /* For a non-global zone, also add the kstat to the global zone. */
1957 1957          if (zone->zone_id != GLOBAL_ZONEID)
1958 1958                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1959 1959  
1960 1960          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
                   /* Grow the data size by the zone name plus its NUL. */
1961 1961          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1962 1962          ksp->ks_lock = &zone->zone_misc_lock;
1963 1963          zone->zone_misc_stats = zmp;
1964 1964  
1965 1965          /* The kstat "name" field is not large enough for a full zonename */
1966 1966          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1967 1967          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1968 1968          kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1969 1969          kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1970 1970          kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1971 1971          kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1972 1972          kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1973 1973          kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1974 1974              KSTAT_DATA_UINT32);
1975 1975          kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1976 1976          kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1977 1977              KSTAT_DATA_UINT32);
1978 1978          kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1979 1979          kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1980 1980          kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1981 1981              KSTAT_DATA_UINT32);
1982 1982          kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1983 1983          kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1984 1984  
1985 1985          ksp->ks_update = zone_misc_kstat_update;
1986 1986          ksp->ks_private = zone;
1987 1987  
1988 1988          kstat_install(ksp);
1989 1989          return (ksp);
1990 1990  }
1991 1991  
           /*
            * Creates all per-zone kstats: "lockedmem", "swapresv" and "nprocs"
            * via zone_kstat_create_common(), plus the memory-cap and misc kstats.
            *
            * If the memory-cap or misc kstat cannot be created, a zeroed
            * stand-alone stats buffer is allocated instead, so zone_mcap_stats
            * and zone_misc_stats are non-NULL on return either way.
            *
            * NOTE(review): zone_kstat_delete() frees stats buffers only through
            * a non-NULL kstat pointer, so these fallback buffers are not freed
            * there; confirm they are released elsewhere.
            */
1992 1992  static void
1993 1993  zone_kstat_create(zone_t *zone)
1994 1994  {
1995 1995          zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1996 1996              "lockedmem", zone_lockedmem_kstat_update);
1997 1997          zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1998 1998              "swapresv", zone_swapresv_kstat_update);
1999 1999          zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2000 2000              "nprocs", zone_nprocs_kstat_update);
2001 2001  
2002 2002          if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2003 2003                  zone->zone_mcap_stats = kmem_zalloc(
2004 2004                      sizeof (zone_mcap_kstat_t), KM_SLEEP);
2005 2005          }
2006 2006  
2007 2007          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2008 2008                  zone->zone_misc_stats = kmem_zalloc(
2009 2009                      sizeof (zone_misc_kstat_t), KM_SLEEP);
2010 2010          }
2011 2011  }
2012 2012  
           /*
            * Deletes the kstat referenced by *pkstat, frees its ks_data buffer of
            * datasz bytes, and resets *pkstat to NULL.  Does nothing when *pkstat
            * is already NULL, so it is safe to call for a kstat that was never
            * created.
            */
2013 2013  static void
2014 2014  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2015 2015  {
2016 2016          void *data;
2017 2017  
2018 2018          if (*pkstat != NULL) {
                           /* Save the data pointer before the kstat is deleted. */
2019 2019                  data = (*pkstat)->ks_data;
2020 2020                  kstat_delete(*pkstat);
2021 2021                  kmem_free(data, datasz);
2022 2022                  *pkstat = NULL;
2023 2023          }
2024 2024  }
2025 2025  
           /*
            * Tears down every kstat set up by zone_kstat_create(), passing each
            * one's data size so zone_kstat_delete_common() can free its buffer.
            */
2026 2026  static void
2027 2027  zone_kstat_delete(zone_t *zone)
2028 2028  {
2029 2029          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2030 2030              sizeof (zone_kstat_t));
2031 2031          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2032 2032              sizeof (zone_kstat_t));
2033 2033          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2034 2034              sizeof (zone_kstat_t));
2035 2035          zone_kstat_delete_common(&zone->zone_mcap_ksp,
2036 2036              sizeof (zone_mcap_kstat_t));
2037 2037          zone_kstat_delete_common(&zone->zone_misc_ksp,
2038 2038              sizeof (zone_misc_kstat_t));
2039 2039  }
2040 2040  
2041 2041  /*
2042 2042   * Called very early on in boot to initialize the ZSD list so that
2043 2043   * zone_key_create() can be called before zone_init().  It also initializes
2044 2044   * portions of zone0 which may be used before zone_init() is called.  The
2045 2045   * variable "global_zone" will be set when zone0 is fully initialized by
2046 2046   * zone_init().
            *
            * Besides the ZSD key list, this creates the zone_active and
            * zone_deathrow lists, initializes zone0's locks, usage counters and
            * limits, inserts zone0 at the head of zone_active, and ties p0 and
            * zone0 to each other.
2047 2047   */
2048 2048  void
2049 2049  zone_zsd_init(void)
2050 2050  {
2051 2051          mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2052 2052          mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2053 2053          list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2054 2054              offsetof(struct zsd_entry, zsd_linkage));
2055 2055          list_create(&zone_active, sizeof (zone_t),
2056 2056              offsetof(zone_t, zone_linkage));
2057 2057          list_create(&zone_deathrow, sizeof (zone_t),
2058 2058              offsetof(zone_t, zone_linkage));
2059 2059  
2060 2060          mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2061 2061          mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2062 2062          mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
                   /*
                    * Usage counters start at zero and each *_ctl limit starts at
                    * the largest value used for its type (INT_MAX or UINT64_MAX).
                    */
2063 2063          zone0.zone_shares = 1;
2064 2064          zone0.zone_nlwps = 0;
2065 2065          zone0.zone_nlwps_ctl = INT_MAX;
2066 2066          zone0.zone_nprocs = 0;
2067 2067          zone0.zone_nprocs_ctl = INT_MAX;
2068 2068          zone0.zone_locked_mem = 0;
2069 2069          zone0.zone_locked_mem_ctl = UINT64_MAX;
2070 2070          ASSERT(zone0.zone_max_swap == 0);
2071 2071          zone0.zone_max_swap_ctl = UINT64_MAX;
2072 2072          zone0.zone_max_lofi = 0;

↓ open down ↓

2072 lines elided

↑ open up ↑

2073 2073          zone0.zone_max_lofi_ctl = UINT64_MAX;
2074 2074          zone0.zone_shmmax = 0;
2075 2075          zone0.zone_ipc.ipcq_shmmni = 0;
2076 2076          zone0.zone_ipc.ipcq_semmni = 0;
2077 2077          zone0.zone_ipc.ipcq_msgmni = 0;
2078 2078          zone0.zone_name = GLOBAL_ZONENAME;
2079 2079          zone0.zone_nodename = utsname.nodename;
2080 2080          zone0.zone_domain = srpc_domain;
2081 2081          zone0.zone_hostid = HW_INVALID_HOSTID;
2082 2082          zone0.zone_fs_allowed = NULL;
     2083 +        psecflags_default(&zone0.zone_secflags);
2083 2084          zone0.zone_ref = 1;
2084 2085          zone0.zone_id = GLOBAL_ZONEID;
2085 2086          zone0.zone_status = ZONE_IS_RUNNING;
2086 2087          zone0.zone_rootpath = "/";
                   /* The stored length is 2, i.e. strlen("/") plus the NUL. */
2087 2088          zone0.zone_rootpathlen = 2;
2088 2089          zone0.zone_psetid = ZONE_PS_INVAL;
2089 2090          zone0.zone_ncpus = 0;
2090 2091          zone0.zone_ncpus_online = 0;
2091 2092          zone0.zone_proc_initpid = 1;
2092 2093          zone0.zone_initname = initname;

2093 2094          zone0.zone_lockedmem_kstat = NULL;
2094 2095          zone0.zone_swapresv_kstat = NULL;
2095 2096          zone0.zone_nprocs_kstat = NULL;
2096 2097  
2097 2098          zone0.zone_stime = 0;
2098 2099          zone0.zone_utime = 0;
2099 2100          zone0.zone_wtime = 0;
2100 2101  
2101 2102          list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2102 2103              offsetof(zone_ref_t, zref_linkage));
2103 2104          list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2104 2105              offsetof(struct zsd_entry, zsd_linkage));
2105 2106          list_insert_head(&zone_active, &zone0);
2106 2107  
2107 2108          /*
2108 2109           * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2109 2110           * to anything meaningful.  It is assigned to be 'rootdir' in
2110 2111           * vfs_mountroot().
2111 2112           */
2112 2113          zone0.zone_rootvp = NULL;
2113 2114          zone0.zone_vfslist = NULL;
2114 2115          zone0.zone_bootargs = initargs;
2115 2116          zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2116 2117          /*
2117 2118           * The global zone has all privileges
2118 2119           */
2119 2120          priv_fillset(zone0.zone_privset);
2120 2121          /*
2121 2122           * Add p0 to the global zone
2122 2123           */
2123 2124          zone0.zone_zsched = &p0;
2124 2125          p0.p_zone = &zone0;
2125 2126  }
2126 2127  
2127 2128  /*
2128 2129   * Compute a hash value based on the contents of the label and the DOI.  The
2129 2130   * hash algorithm is somewhat arbitrary, but is based on the observation that
2130 2131   * humans will likely pick labels that differ by amounts that work out to be
2131 2132   * multiples of the number of hash chains, and thus stirring in some primes
2132 2133   * should help.
            *
            * The key is treated as a ts_label_t pointer; hdata is unused.  The
            * label is consumed as a sequence of 32-bit words.
2133 2134   */
2134 2135  static uint_t
2135 2136  hash_bylabel(void *hdata, mod_hash_key_t key)
2136 2137  {
2137 2138          const ts_label_t *lab = (ts_label_t *)key;
2138 2139          const uint32_t *up, *ue;
2139 2140          uint_t hash;
2140 2141          int i;
2141 2142  
2142 2143          _NOTE(ARGUNUSED(hdata));
2143 2144  
                   /* Seed the hash with three times the DOI (doi + 2 * doi). */
2144 2145          hash = lab->tsl_doi + (lab->tsl_doi << 1);
2145 2146          /* we depend on alignment of label, but not representation */
2146 2147          up = (const uint32_t *)&lab->tsl_label;
                   /* ue points one past the last whole 32-bit word of the label. */
2147 2148          ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2148 2149          i = 1;
2149 2150          while (up < ue) {
2150 2151                  /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
                           /* Adds *up * (2^n + 1), where n = (i % 16) + 1. */
2151 2152                  hash += *up + (*up << ((i % 16) + 1));
2152 2153                  up++;
2153 2154                  i++;
2154 2155          }
2155 2156          return (hash);
2156 2157  }
2157 2158  
2158 2159  /*
2159 2160   * All that mod_hash cares about here is zero (equal) versus non-zero (not
2160 2161   * equal).  This may need to be changed if less than / greater than is ever
2161 2162   * needed.
            *
            * Both keys are treated as ts_label_t pointers.  Returns 0 when
            * label_equal() reports a match and 1 otherwise.
2162 2163   */
2163 2164  static int
2164 2165  hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2165 2166  {
2166 2167          ts_label_t *lab1 = (ts_label_t *)key1;
2167 2168          ts_label_t *lab2 = (ts_label_t *)key2;
2168 2169  
2169 2170          return (label_equal(lab1, lab2) ? 0 : 1);
2170 2171  }
2171 2172  
2172 2173  /*
2173 2174   * Called by main() to initialize the zones framework.
            *
            * In order, this creates the zone ID space, registers the zone.*
            * resource controls, finishes setting up zone0 (rctl set, counts,
            * brand, kstats, label, locks), creates the zone hash tables and enters
            * zone0 into them, publishes zone0 as global_zone, and binds the zone
            * status event channel.  Panics if the event channel cannot be bound.
2174 2175   */
2175 2176  void
2176 2177  zone_init(void)
2177 2178  {
2178 2179          rctl_dict_entry_t *rde;
2179 2180          rctl_val_t *dval;
2180 2181          rctl_set_t *set;
2181 2182          rctl_alloc_gp_t *gp;
2182 2183          rctl_entity_p_t e;
2183 2184          int res;
2184 2185  
2185 2186          ASSERT(curproc == &p0);
2186 2187  
2187 2188          /*
2188 2189           * Create ID space for zone IDs.  ID 0 is reserved for the
2189 2190           * global zone.
2190 2191           */
2191 2192          zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2192 2193  
2193 2194          /*
2194 2195           * Initialize generic zone resource controls, if any.
2195 2196           */
2196 2197          rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2197 2198              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2198 2199              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2199 2200              FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2200 2201  
2201 2202          rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2202 2203              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2203 2204              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2204 2205              RCTL_GLOBAL_INFINITE,
2205 2206              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2206 2207  
2207 2208          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2208 2209              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2209 2210              INT_MAX, INT_MAX, &zone_lwps_ops);
2210 2211  
2211 2212          rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2212 2213              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2213 2214              INT_MAX, INT_MAX, &zone_procs_ops);
2214 2215  
2215 2216          /*
2216 2217           * System V IPC resource controls
2217 2218           */
2218 2219          rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2219 2220              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2220 2221              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2221 2222  
2222 2223          rc_zone_semmni = rctl_register("zone.max-sem-ids",
2223 2224              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2224 2225              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2225 2226  
2226 2227          rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2227 2228              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2228 2229              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2229 2230  
2230 2231          rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2231 2232              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2232 2233              RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2233 2234  
2234 2235          /*
2235 2236           * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2236 2237           * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2237 2238           */
2238 2239          dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2239 2240          bzero(dval, sizeof (rctl_val_t));
2240 2241          dval->rcv_value = 1;
2241 2242          dval->rcv_privilege = RCPRIV_PRIVILEGED;
2242 2243          dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2243 2244          dval->rcv_action_recip_pid = -1;
2244 2245  
                   /*
                    * The lookup result is used without a NULL check; the control
                    * was registered just above.
                    */
2245 2246          rde = rctl_dict_lookup("zone.cpu-shares");
2246 2247          (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2247 2248  
2248 2249          rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2249 2250              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2250 2251              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2251 2252              &zone_locked_mem_ops);
2252 2253  
2253 2254          rc_zone_max_swap = rctl_register("zone.max-swap",
2254 2255              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2255 2256              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2256 2257              &zone_max_swap_ops);
2257 2258  
2258 2259          rc_zone_max_lofi = rctl_register("zone.max-lofi",
2259 2260              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2260 2261              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2261 2262              &zone_max_lofi_ops);
2262 2263  
2263 2264          /*
2264 2265           * Initialize the ``global zone''.
2265 2266           */
2266 2267          set = rctl_set_create();
2267 2268          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
                   /*
                    * p0.p_lock is held across rctl_set_init() and the initial
                    * lwp/process/task counts for zone0.
                    */
2268 2269          mutex_enter(&p0.p_lock);
2269 2270          e.rcep_p.zone = &zone0;
2270 2271          e.rcep_t = RCENTITY_ZONE;
2271 2272          zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2272 2273              gp);
2273 2274  
2274 2275          zone0.zone_nlwps = p0.p_lwpcnt;
2275 2276          zone0.zone_nprocs = 1;
2276 2277          zone0.zone_ntasks = 1;
2277 2278          mutex_exit(&p0.p_lock);
2278 2279          zone0.zone_restart_init = B_TRUE;
2279 2280          zone0.zone_brand = &native_brand;
2280 2281          rctl_prealloc_destroy(gp);
2281 2282          /*
2282 2283           * pool_default hasn't been initialized yet, so we let pool_init()
2283 2284           * take care of making sure the global zone is in the default pool.
2284 2285           */
2285 2286  
2286 2287          /*
2287 2288           * Initialize global zone kstats
2288 2289           */
2289 2290          zone_kstat_create(&zone0);
2290 2291  
2291 2292          /*
2292 2293           * Initialize zone label.
2293 2294           * mlp are initialized when tnzonecfg is loaded.
2294 2295           */
2295 2296          zone0.zone_slabel = l_admin_low;
2296 2297          rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
                   /* Hold l_admin_low, which zone0.zone_slabel now points at. */
2297 2298          label_hold(l_admin_low);
2298 2299  
2299 2300          /*
2300 2301           * Initialise the lock for the database structure used by mntfs.
2301 2302           */
2302 2303          rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2303 2304  
                   /*
                    * zonehash_lock is held while zone0 gets its unique id and the
                    * hash tables are created and populated.
                    */
2304 2305          mutex_enter(&zonehash_lock);
2305 2306          zone_uniqid(&zone0);
2306 2307          ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2307 2308  
2308 2309          zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2309 2310              mod_hash_null_valdtor);
2310 2311          zonehashbyname = mod_hash_create_strhash("zone_by_name",
2311 2312              zone_hash_size, mod_hash_null_valdtor);
2312 2313          /*
2313 2314           * maintain zonehashbylabel only for labeled systems
2314 2315           */
2315 2316          if (is_system_labeled())
2316 2317                  zonehashbylabel = mod_hash_create_extended("zone_by_label",
2317 2318                      zone_hash_size, mod_hash_null_keydtor,
2318 2319                      mod_hash_null_valdtor, hash_bylabel, NULL,
2319 2320                      hash_labelkey_cmp, KM_SLEEP);
2320 2321          zonecount = 1;
2321 2322  
                   /*
                    * Enter zone0 into the by-id and by-name hashes, and into the
                    * by-label hash on labeled systems.  Insert results are ignored.
                    */
2322 2323          (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2323 2324              (mod_hash_val_t)&zone0);
2324 2325          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2325 2326              (mod_hash_val_t)&zone0);
2326 2327          if (is_system_labeled()) {
2327 2328                  zone0.zone_flags |= ZF_HASHED_LABEL;
2328 2329                  (void) mod_hash_insert(zonehashbylabel,
2329 2330                      (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2330 2331          }
2331 2332          mutex_exit(&zonehash_lock);
2332 2333  
2333 2334          /*
2334 2335           * We avoid setting zone_kcred until now, since kcred is initialized
2335 2336           * sometime after zone_zsd_init() and before zone_init().
2336 2337           */
2337 2338          zone0.zone_kcred = kcred;
2338 2339          /*
2339 2340           * The global zone is fully initialized (except for zone_rootvp which
2340 2341           * will be set when the root filesystem is mounted).
2341 2342           */
2342 2343          global_zone = &zone0;
2343 2344  
2344 2345          /*
2345 2346           * Setup an event channel to send zone status change notifications on
2346 2347           */
2347 2348          res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2348 2349              EVCH_CREAT);
2349 2350  
                   /* Any non-zero result from the bind is treated as fatal. */
2350 2351          if (res)
2351 2352                  panic("Sysevent_evc_bind failed during zone setup.\n");
2352 2353  
2353 2354  }
2354 2355  
           /*
            * Release everything owned by a non-global zone_t and free the structure.
            *
            * The caller must already have brought the zone to a state where it has
            * no tasks, lwps, processes, cred references or kcred, and where its
            * status is ZONE_IS_DEAD or ZONE_IS_UNINITIALIZED; these conditions are
            * only checked with ASSERT().  A DEAD zone is also unlinked from the
            * zone_deathrow list.  The zone ID is returned to zoneid_space.
            */
2355 2356  static void
2356 2357  zone_free(zone_t *zone)
2357 2358  {
2358 2359          ASSERT(zone != global_zone);
2359 2360          ASSERT(zone->zone_ntasks == 0);
2360 2361          ASSERT(zone->zone_nlwps == 0);
2361 2362          ASSERT(zone->zone_nprocs == 0);
2362 2363          ASSERT(zone->zone_cred_ref == 0);
2363 2364          ASSERT(zone->zone_kcred == NULL);
2364 2365          ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2365 2366              zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2366 2367          ASSERT(list_is_empty(&zone->zone_ref_list));
2367 2368  
2368 2369          /*
2369 2370           * Remove any zone caps.
2370 2371           */
2371 2372          cpucaps_zone_remove(zone);
2372 2373  
2373 2374          ASSERT(zone->zone_cpucap == NULL);
2374 2375  
2375 2376          /* remove from deathrow list */
2376 2377          if (zone_status_get(zone) == ZONE_IS_DEAD) {
2377 2378                  ASSERT(zone->zone_ref == 0);
2378 2379                  mutex_enter(&zone_deathrow_lock);
2379 2380                  list_remove(&zone_deathrow, zone);
2380 2381                  mutex_exit(&zone_deathrow_lock);
2381 2382          }
2382 2383  
2383 2384          list_destroy(&zone->zone_ref_list);
2384 2385          zone_free_zsd(zone);
2385 2386          zone_free_datasets(zone);
2386 2387          list_destroy(&zone->zone_dl_list);
2387 2388  
                   /*
                    * Each optional member below is released only if it was set,
                    * so a partially constructed zone can be passed in.
                    */
2388 2389          if (zone->zone_rootvp != NULL)
2389 2390                  VN_RELE(zone->zone_rootvp);
2390 2391          if (zone->zone_rootpath)
2391 2392                  kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2392 2393          if (zone->zone_name != NULL)
2393 2394                  kmem_free(zone->zone_name, ZONENAME_MAX);
2394 2395          if (zone->zone_slabel != NULL)
2395 2396                  label_rele(zone->zone_slabel);
2396 2397          if (zone->zone_nodename != NULL)
2397 2398                  kmem_free(zone->zone_nodename, _SYS_NMLN);
2398 2399          if (zone->zone_domain != NULL)
2399 2400                  kmem_free(zone->zone_domain, _SYS_NMLN);
2400 2401          if (zone->zone_privset != NULL)
2401 2402                  kmem_free(zone->zone_privset, sizeof (priv_set_t));
2402 2403          if (zone->zone_rctls != NULL)
2403 2404                  rctl_set_free(zone->zone_rctls);
2404 2405          if (zone->zone_bootargs != NULL)
2405 2406                  strfree(zone->zone_bootargs);
2406 2407          if (zone->zone_initname != NULL)
2407 2408                  strfree(zone->zone_initname);
2408 2409          if (zone->zone_fs_allowed != NULL)
2409 2410                  strfree(zone->zone_fs_allowed);
2410 2411          if (zone->zone_pfexecd != NULL)
2411 2412                  klpd_freelist(&zone->zone_pfexecd);
2412 2413          id_free(zoneid_space, zone->zone_id);
2413 2414          mutex_destroy(&zone->zone_lock);
2414 2415          cv_destroy(&zone->zone_cv);
2415 2416          rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2416 2417          rw_destroy(&zone->zone_mntfs_db_lock);
2417 2418          kmem_free(zone, sizeof (zone_t));
2418 2419  }
2419 2420  
2420 2421  /*
2421 2422   * See block comment at the top of this file for information about zone
2422 2423   * status values.
2423 2424   */
2424 2425  /*
2425 2426   * Convenience function for setting zone status.
            *
            * The caller must hold zone_status_lock.  'status' must be a valid
            * state and must not be lower than the zone's current status (both are
            * only checked with ASSERT()).  A state-change event naming the zone,
            * its id, the old and new states and a timestamp is published on
            * zone_event_chan before the status is updated; waiters on zone_cv are
            * then woken.
2426 2427   */
2427 2428  static void
2428 2429  zone_status_set(zone_t *zone, zone_status_t status)
2429 2430  {
2430 2431  
2431 2432          nvlist_t *nvl = NULL;
2432 2433          ASSERT(MUTEX_HELD(&zone_status_lock));
2433 2434          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2434 2435              status >= zone_status_get(zone));
2435 2436  
                   /*
                    * The || chain stops at the first step that returns non-zero.
                    * Failure is reported only on DEBUG kernels and does not
                    * prevent the status change below.
                    */
2436 2437          if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2437 2438              nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2438 2439              nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2439 2440              zone_status_table[status]) ||
2440 2441              nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2441 2442              zone_status_table[zone->zone_status]) ||
2442 2443              nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2443 2444              nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2444 2445              sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2445 2446              ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2446 2447  #ifdef DEBUG
2447 2448                  (void) printf(
2448 2449                      "Failed to allocate and send zone state change event.\n");
2449 2450  #endif
2450 2451          }
2451 2452          nvlist_free(nvl);
2452 2453  
2453 2454          zone->zone_status = status;
2454 2455  
                   /* Wake every thread blocked in one of the zone_status_*wait*() calls. */
2455 2456          cv_broadcast(&zone->zone_cv);
2456 2457  }
2457 2458  
2458 2459  /*
2459 2460   * Public function to retrieve the zone status.  The zone status may
2460 2461   * change after it is retrieved.
            *
            * This is a plain read of zone->zone_status; no lock is taken here.
2461 2462   */
2462 2463  zone_status_t
2463 2464  zone_status_get(zone_t *zone)
2464 2465  {
2465 2466          return (zone->zone_status);
2466 2467  }
2467 2468  
           /*
            * Replace a non-global zone's boot arguments.
            *
            * 'zone_bootargs' is copied in with copyinstr() into a temporary
            * BOOTARGS_MAX-byte buffer.  On success any previous zone_bootargs
            * string is freed and a strdup()ed copy of the new one is installed.
            * Returns 0 on success or the copyinstr() error otherwise, in which
            * case the zone is left unchanged.
            */
2468 2469  static int
2469 2470  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2470 2471  {
2471 2472          char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2472 2473          int err = 0;
2473 2474  
2474 2475          ASSERT(zone != global_zone);
2475 2476          if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2476 2477                  goto done;      /* EFAULT or ENAMETOOLONG */
2477 2478  
2478 2479          if (zone->zone_bootargs != NULL)
2479 2480                  strfree(zone->zone_bootargs);
2480 2481  
2481 2482          zone->zone_bootargs = strdup(buf);
2482 2483  
2483 2484  done:
                   /* The temporary buffer is released on both paths. */
2484 2485          kmem_free(buf, BOOTARGS_MAX);
2485 2486          return (err);
2486 2487  }
2487 2488  
           /*
            * Assign a brand to a zone.
            *
            * 'brand' is a user address from which a struct brand_attr is copied in
            * and handed to brand_register_zone().  Returns EFAULT if the copyin
            * fails, EINVAL if no brand is returned, or if the zone is already
            * branded or has reached ZONE_IS_BOOTING, and 0 on success.
            */
2488 2489  static int
2489 2490  zone_set_brand(zone_t *zone, const char *brand)
2490 2491  {
2491 2492          struct brand_attr *attrp;
2492 2493          brand_t *bp;
2493 2494  
2494 2495          attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2495 2496          if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2496 2497                  kmem_free(attrp, sizeof (struct brand_attr));
2497 2498                  return (EFAULT);
2498 2499          }
2499 2500  
2500 2501          bp = brand_register_zone(attrp);
2501 2502          kmem_free(attrp, sizeof (struct brand_attr));
2502 2503          if (bp == NULL)
2503 2504                  return (EINVAL);
2504 2505  
2505 2506          /*
2506 2507           * This is the only place where a zone can change its brand.
2507 2508           * We already need to hold zone_status_lock to check the zone
2508 2509           * status, so we'll just use that lock to serialize zone
2509 2510           * branding requests as well.
2510 2511           */
2511 2512          mutex_enter(&zone_status_lock);
2512 2513  
2513 2514          /* Re-Branding is not allowed and the zone can't be booted yet */
2514 2515          if ((ZONE_IS_BRANDED(zone)) ||
2515 2516              (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2516 2517                  mutex_exit(&zone_status_lock);
                           /* Drop the registration taken by brand_register_zone(). */
2517 2518                  brand_unregister_zone(bp);
2518 2519                  return (EINVAL);
2519 2520          }

↓ open down ↓

427 lines elided

↑ open up ↑

2520 2521  
2521 2522          /* set up the brand specific data */
2522 2523          zone->zone_brand = bp;
2523 2524          ZBROP(zone)->b_init_brand_data(zone);
2524 2525  
2525 2526          mutex_exit(&zone_status_lock);
2526 2527          return (0);
2527 2528  }
2528 2529  
           /*
            * Install security flags for a non-global zone.
            *
            * 'zone_secflags' is a user address from which a psecflags_t is copied
            * in.  The flags are rejected once the zone has progressed beyond
            * ZONE_IS_READY, or if psecflags_validate() does not accept them.  On
            * success the flags are stored in the zone and also copied onto the
            * zone's zsched process.
            *
            * Returns 0 on success, EFAULT if the copyin fails, or EINVAL for a
            * zone that is too far along or for invalid flags.
            */
2529 2530  static int
     2531 +zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
     2532 +{
     2534 +        psecflags_t psf;
     2535 +
     2536 +        ASSERT(zone != global_zone);
     2537 +
                   /*
                    * copyin() signals failure with a non-zero value that is not
                    * itself an errno, so translate it rather than returning it.
                    */
                   if (copyin(zone_secflags, &psf, sizeof (psf)) != 0)
                           return (EFAULT);
     2540 +
     2541 +        if (zone_status_get(zone) > ZONE_IS_READY)
     2542 +                return (EINVAL);
     2543 +
     2544 +        if (!psecflags_validate(&psf))
     2545 +                return (EINVAL);
     2546 +
     2547 +        (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
     2548 +
     2549 +        /* Set security flags on the zone's zsched */
     2550 +        (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
     2551 +            sizeof (zone->zone_zsched->p_secflags));
     2552 +
     2553 +        return (0);
     2554 +}
     2555 +
           /*
            * Replace a non-global zone's fs-allowed string.
            *
            * 'zone_fs_allowed' is copied in with copyinstr() into a temporary
            * ZONE_FS_ALLOWED_MAX-byte buffer.  On success any previous string is
            * freed and a strdup()ed copy of the new one is installed.  Returns 0
            * on success or the copyinstr() error otherwise, in which case the zone
            * is left unchanged.
            */
     2556 +static int
2530 2557  zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2531 2558  {
2532 2559          char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2533 2560          int err = 0;
2534 2561  
2535 2562          ASSERT(zone != global_zone);
2536 2563          if ((err = copyinstr(zone_fs_allowed, buf,
2537 2564              ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2538 2565                  goto done;
2539 2566

2540 2567          if (zone->zone_fs_allowed != NULL)
2541 2568                  strfree(zone->zone_fs_allowed);
2542 2569  
2543 2570          zone->zone_fs_allowed = strdup(buf);
2544 2571  
2545 2572  done:
                   /* The temporary buffer is released on both paths. */
2546 2573          kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2547 2574          return (err);
2548 2575  }
2549 2576  
           /*
            * Replace a non-global zone's init program name.
            *
            * 'zone_initname' is copied in with copyinstr() into an on-stack
            * INITNAME_SZ-byte buffer.  On success any previous zone_initname is
            * freed and a freshly allocated copy is installed.  Returns 0 on
            * success or the copyinstr() error otherwise.
            */
2550 2577  static int
2551 2578  zone_set_initname(zone_t *zone, const char *zone_initname)
2552 2579  {
2553 2580          char initname[INITNAME_SZ];
                   /* Receives the length from copyinstr(); not otherwise used. */
2554 2581          size_t len;
2555 2582          int err = 0;
2556 2583  
2557 2584          ASSERT(zone != global_zone);
2558 2585          if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2559 2586                  return (err);   /* EFAULT or ENAMETOOLONG */
2560 2587  
2561 2588          if (zone->zone_initname != NULL)
2562 2589                  strfree(zone->zone_initname);
2563 2590  
                   /* Allocate exactly strlen + 1 bytes for the copy. */
2564 2591          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2565 2592          (void) strcpy(zone->zone_initname, initname);
2566 2593          return (0);
2567 2594  }
2568 2595  
2569 2596  static int
2570 2597  zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2571 2598  {
2572 2599          uint64_t mcap;
2573 2600          int err = 0;
2574 2601  
2575 2602          if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2576 2603                  zone->zone_phys_mcap = mcap;
2577 2604  
2578 2605          return (err);
2579 2606  }
2580 2607  
           /*
            * Set a non-global zone's default scheduling class.
            *
            * 'new_class' is a user address holding a class name, copied in with
            * copyinstr() into a PC_CLNMSZ-byte buffer.  The name must resolve via
            * getcid() and must not satisfy CLASS_KERNEL().
            *
            * Returns 0 on success, the copyinstr() error if the copy fails, or
            * EINVAL for an unknown or kernel class.  Every failure path returns a
            * plain error number, like the other zone_set_*() helpers in this file.
            */
2581 2608  static int
2582 2609  zone_set_sched_class(zone_t *zone, const char *new_class)
2583 2610  {
2584 2611          char sched_class[PC_CLNMSZ];
2585 2612          id_t classid;
2586 2613          int err;
2587 2614  
2588 2615          ASSERT(zone != global_zone);
2589 2616          if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2590 2617                  return (err);   /* EFAULT or ENAMETOOLONG */
2591 2618  
2592 2619          if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
                           return (EINVAL);
2594 2621          zone->zone_defaultcid = classid;
2595 2622          ASSERT(zone->zone_defaultcid > 0 &&
2596 2623              zone->zone_defaultcid < loaded_classes);
2597 2624  
2598 2625          return (0);
2599 2626  }
2600 2627  
2601 2628  /*
2602 2629   * Block indefinitely waiting for (zone_status >= status)
            *
            * Takes and releases zone_status_lock internally and sleeps on the
            * zone's zone_cv.  Returns immediately if the zone is already at or
            * past 'status'.
2603 2630   */
2604 2631  void
2605 2632  zone_status_wait(zone_t *zone, zone_status_t status)
2606 2633  {
2607 2634          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2608 2635  
2609 2636          mutex_enter(&zone_status_lock);
2610 2637          while (zone->zone_status < status) {
2611 2638                  cv_wait(&zone->zone_cv, &zone_status_lock);
2612 2639          }
2613 2640          mutex_exit(&zone_status_lock);
2614 2641  }
2615 2642  
2616 2643  /*
2617 2644   * Private CPR-safe version of zone_status_wait().
            *
            * Each cv_wait() is bracketed by CALLB_CPR_SAFE_BEGIN/END.  'str' is
            * passed through to CALLB_CPR_INIT().
2618 2645   */
2619 2646  static void
2620 2647  zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2621 2648  {
2622 2649          callb_cpr_t cprinfo;
2623 2650  
2624 2651          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2625 2652  
2626 2653          CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2627 2654              str);
2628 2655          mutex_enter(&zone_status_lock);
2629 2656          while (zone->zone_status < status) {
2630 2657                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
2631 2658                  cv_wait(&zone->zone_cv, &zone_status_lock);
2632 2659                  CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2633 2660          }
2634 2661          /*
2635 2662           * zone_status_lock is implicitly released by the following.
2636 2663           */
2637 2664          CALLB_CPR_EXIT(&cprinfo);
2638 2665  }
2639 2666  
2640 2667  /*
2641 2668   * Block until zone enters requested state or signal is received.  Return (0)
2642 2669   * if signaled, non-zero otherwise.
            *
            * Returns 1 without sleeping if the zone is already at or past 'status'.
2643 2670   */
2644 2671  int
2645 2672  zone_status_wait_sig(zone_t *zone, zone_status_t status)
2646 2673  {
2647 2674          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2648 2675  
2649 2676          mutex_enter(&zone_status_lock);
2650 2677          while (zone->zone_status < status) {
                           /* A zero return from cv_wait_sig() ends the wait early. */
2651 2678                  if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2652 2679                          mutex_exit(&zone_status_lock);
2653 2680                          return (0);
2654 2681                  }
2655 2682          }
2656 2683          mutex_exit(&zone_status_lock);
2657 2684          return (1);
2658 2685  }
2659 2686  
2660 2687  /*
2661 2688   * Block until the zone enters the requested state or the timeout expires,
2662 2689   * whichever happens first.  Return (-1) if operation timed out, time remaining
2663 2690   * otherwise.
            *
            * 'tim' is handed unchanged to cv_timedwait() on every iteration.  If
            * the zone is already at or past 'status' no wait takes place and 0 is
            * returned.
2664 2691   */
2665 2692  clock_t
2666 2693  zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2667 2694  {
2668 2695          clock_t timeleft = 0;
2669 2696  
2670 2697          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2671 2698  
2672 2699          mutex_enter(&zone_status_lock);
2673 2700          while (zone->zone_status < status && timeleft != -1) {
2674 2701                  timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2675 2702          }
2676 2703          mutex_exit(&zone_status_lock);
2677 2704          return (timeleft);
2678 2705  }
2679 2706  
2680 2707  /*
2681 2708   * Block until the zone enters the requested state, the current process is
2682 2709   * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2683 2710   * operation timed out, 0 if signaled, time remaining otherwise.
            *
            * If the zone is already at or past 'status' no wait takes place and
            * the value returned is tim - ddi_get_lbolt() as computed on entry.
            * NOTE(review): that suggests 'tim' is an absolute lbolt deadline --
            * confirm against callers.
2684 2711   */
2685 2712  clock_t
2686 2713  zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2687 2714  {
2688 2715          clock_t timeleft = tim - ddi_get_lbolt();
2689 2716  
2690 2717          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2691 2718  
2692 2719          mutex_enter(&zone_status_lock);
2693 2720          while (zone->zone_status < status) {
2694 2721                  timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2695 2722                      tim);
                           /* Stop on timeout (-1) or signal (0). */
2696 2723                  if (timeleft <= 0)
2697 2724                          break;
2698 2725          }
2699 2726          mutex_exit(&zone_status_lock);
2700 2727          return (timeleft);
2701 2728  }
2702 2729  
2703 2730  /*
2704 2731   * Zones have two reference counts: one for references from credential
2705 2732   * structures (zone_cred_ref), and one (zone_ref) for everything else.
2706 2733   * This is so we can allow a zone to be rebooted while there are still
2707 2734   * outstanding cred references, since certain drivers cache dblks (which
2708 2735   * implicitly results in cached creds).  We wait for zone_ref to drop to
2709 2736   * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2710 2737   * later freed when the zone_cred_ref drops to 0, though nothing other
2711 2738   * than the zone id and privilege set should be accessed once the zone
2712 2739   * is "dead".
2713 2740   *
2714 2741   * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2715 2742   * to force halt/reboot to block waiting for the zone_cred_ref to drop
2716 2743   * to 0.  This can be useful to flush out other sources of cached creds
2717 2744   * that may be less innocuous than the driver case.
2718 2745   *
2719 2746   * Zones also provide a tracked reference counting mechanism in which zone
2720 2747   * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2721 2748   * debuggers determine the sources of leaked zone references.  See
2722 2749   * zone_hold_ref() and zone_rele_ref() below for more information.
2723 2750   */
2724 2751  
           /*
            * Debugging flag, 0 by default.  When non-zero, ZONE_IS_UNREF() also
            * requires zone_cred_ref to be 0 before a zone counts as unreferenced.
            */
2725 2752  int zone_wait_for_cred = 0;
2726 2753  
           /*
            * Increment z->zone_ref.  The caller must already hold z->zone_lock.
            * The trailing ASSERT catches the counter wrapping around to 0.
            */
2727 2754  static void
2728 2755  zone_hold_locked(zone_t *z)
2729 2756  {
2730 2757          ASSERT(MUTEX_HELD(&z->zone_lock));
2731 2758          z->zone_ref++;
2732 2759          ASSERT(z->zone_ref != 0);
2733 2760  }
2734 2761  
2735 2762  /*
2736 2763   * Increment the specified zone's reference count.  The zone's zone_t structure
2737 2764   * will not be freed as long as the zone's reference count is nonzero.
2738 2765   * Decrement the zone's reference count via zone_rele().
2739 2766   *
2740 2767   * NOTE: This function should only be used to hold zones for short periods of
2741 2768   * time.  Use zone_hold_ref() if the zone must be held for a long time.
            *
            * Takes and drops z->zone_lock around the increment.
2742 2769   */
2743 2770  void
2744 2771  zone_hold(zone_t *z)
2745 2772  {
2746 2773          mutex_enter(&z->zone_lock);
2747 2774          zone_hold_locked(z);
2748 2775          mutex_exit(&z->zone_lock);
2749 2776  }
2750 2777  
2751 2778  /*
2752 2779   * If the non-cred ref count drops to 1 and either the cred ref count
2753 2780   * is 0 or we aren't waiting for cred references, the zone is ready to
2754 2781   * be destroyed.
            *
            * The macro reads zone_ref and zone_cred_ref directly and evaluates its
            * argument more than once.
2755 2782   */
2756 2783  #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
2757 2784              (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2758 2785  
2759 2786  /*
2760 2787   * Common zone reference release function invoked by zone_rele() and
2761 2788   * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2762 2789   * zone's subsystem-specific reference counters are not affected by the
2763 2790   * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2764 2791   * removed from the specified zone's reference list.  ref must be non-NULL iff
2765 2792   * subsys is not ZONE_REF_NUM_SUBSYS.
            *
            * If both zone_ref and zone_cred_ref reach 0 the zone is freed here and
            * 'z' must not be used by the caller afterwards.
2766 2793   */
2767 2794  static void
2768 2795  zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2769 2796  {
2770 2797          boolean_t wakeup;
2771 2798  
2772 2799          mutex_enter(&z->zone_lock);
2773 2800          ASSERT(z->zone_ref != 0);
2774 2801          z->zone_ref--;
2775 2802          if (subsys != ZONE_REF_NUM_SUBSYS) {
2776 2803                  ASSERT(z->zone_subsys_ref[subsys] != 0);
2777 2804                  z->zone_subsys_ref[subsys]--;
2778 2805                  list_remove(&z->zone_ref_list, ref);
2779 2806          }
2780 2807          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2781 2808                  /* no more refs, free the structure */
                           /* zone_lock is dropped first; zone_free() destroys it. */
2782 2809                  mutex_exit(&z->zone_lock);
2783 2810                  zone_free(z);
2784 2811                  return;
2785 2812          }
2786 2813          /* signal zone_destroy so the zone can finish halting */
                   /* The decision is made under zone_lock and acted on after it. */
2787 2814          wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2788 2815          mutex_exit(&z->zone_lock);
2789 2816  
2790 2817          if (wakeup) {
2791 2818                  /*
2792 2819                   * Grabbing zonehash_lock here effectively synchronizes with
2793 2820                   * zone_destroy() to avoid missed signals.
2794 2821                   */
2795 2822                  mutex_enter(&zonehash_lock);
2796 2823                  cv_broadcast(&zone_destroy_cv);
2797 2824                  mutex_exit(&zonehash_lock);
2798 2825          }
2799 2826  }
2800 2827  
2801 2828  /*
2802 2829   * Decrement the specified zone's reference count.  The specified zone will
2803 2830   * cease to exist after this function returns if the reference count drops to
2804 2831   * zero.  This function should be paired with zone_hold().
            *
            * Implemented as zone_rele_common() with no zone_ref_t and with
            * ZONE_REF_NUM_SUBSYS, so no per-subsystem counter is touched.
2805 2832   */
2806 2833  void
2807 2834  zone_rele(zone_t *z)
2808 2835  {
2809 2836          zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2810 2837  }
2811 2838  
2812 2839  /*
2813 2840   * Initialize a zone reference structure.  This function must be invoked for
2814 2841   * a reference structure before the structure is passed to zone_hold_ref().
            *
            * Clears zref_zone (which zone_hold_ref() verifies is NULL) and
            * initializes the list linkage.
2815 2842   */
2816 2843  void
2817 2844  zone_init_ref(zone_ref_t *ref)
2818 2845  {
2819 2846          ref->zref_zone = NULL;
2820 2847          list_link_init(&ref->zref_linkage);
2821 2848  }
2822 2849  
2823 2850  /*
2824 2851   * Acquire a reference to zone z.  The caller must specify the
2825 2852   * zone_ref_subsys_t constant associated with its subsystem.  The specified
2826 2853   * zone_ref_t structure will represent a reference to the specified zone.  Use
2827 2854   * zone_rele_ref() to release the reference.
2828 2855   *
2829 2856   * The referenced zone_t structure will not be freed as long as the zone_t's
2830 2857   * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2831 2858   * references.
2832 2859   *
2833 2860   * NOTE: The zone_ref_t structure must be initialized before it is used.
2834 2861   * See zone_init_ref() above.
2835 2862   */
2836 2863  void
2837 2864  zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2838 2865  {
2839 2866          ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2840 2867  
2841 2868          /*
2842 2869           * Prevent consumers from reusing a reference structure before
2843 2870           * releasing it.
2844 2871           */
2845 2872          VERIFY(ref->zref_zone == NULL);
2846 2873  
2847 2874          ref->zref_zone = z;
                   /*
                    * Under zone_lock: bump the overall count and the per-subsystem
                    * count, then link the crumb onto the zone's reference list.
                    */
2848 2875          mutex_enter(&z->zone_lock);
2849 2876          zone_hold_locked(z);
2850 2877          z->zone_subsys_ref[subsys]++;
2851 2878          ASSERT(z->zone_subsys_ref[subsys] != 0);
2852 2879          list_insert_head(&z->zone_ref_list, ref);
2853 2880          mutex_exit(&z->zone_lock);
2854 2881  }
2855 2882  
2856 2883  /*
2857 2884   * Release the zone reference represented by the specified zone_ref_t.
2858 2885   * The reference is invalid after it's released; however, the zone_ref_t
2859 2886   * structure can be reused without having to invoke zone_init_ref().
2860 2887   * subsys should be the same value that was passed to zone_hold_ref()
2861 2888   * when the reference was acquired.
2862 2889   */
2863 2890  void
2864 2891  zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2865 2892  {
2866 2893          zone_rele_common(ref->zref_zone, ref, subsys);
2867 2894  
2868 2895          /*
2869 2896           * Set the zone_ref_t's zref_zone field to NULL to generate panics
2870 2897           * when consumers dereference the reference.  This helps us catch
2871 2898           * consumers who use released references.  Furthermore, this lets
2872 2899           * consumers reuse the zone_ref_t structure without having to
2873 2900           * invoke zone_init_ref().
2874 2901           */
2875 2902          ref->zref_zone = NULL;
2876 2903  }
2877 2904  
2878 2905  void
2879 2906  zone_cred_hold(zone_t *z)
2880 2907  {
2881 2908          mutex_enter(&z->zone_lock);
2882 2909          z->zone_cred_ref++;
2883 2910          ASSERT(z->zone_cred_ref != 0);
2884 2911          mutex_exit(&z->zone_lock);
2885 2912  }
2886 2913  
2887 2914  void
2888 2915  zone_cred_rele(zone_t *z)
2889 2916  {
2890 2917          boolean_t wakeup;
2891 2918  
2892 2919          mutex_enter(&z->zone_lock);
2893 2920          ASSERT(z->zone_cred_ref != 0);
2894 2921          z->zone_cred_ref--;
2895 2922          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2896 2923                  /* no more refs, free the structure */
2897 2924                  mutex_exit(&z->zone_lock);
2898 2925                  zone_free(z);
2899 2926                  return;
2900 2927          }
2901 2928          /*
2902 2929           * If zone_destroy is waiting for the cred references to drain
2903 2930           * out, and they have, signal it.
2904 2931           */
2905 2932          wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2906 2933              zone_status_get(z) >= ZONE_IS_DEAD);
2907 2934          mutex_exit(&z->zone_lock);
2908 2935  
2909 2936          if (wakeup) {
2910 2937                  /*
2911 2938                   * Grabbing zonehash_lock here effectively synchronizes with
2912 2939                   * zone_destroy() to avoid missed signals.
2913 2940                   */
2914 2941                  mutex_enter(&zonehash_lock);
2915 2942                  cv_broadcast(&zone_destroy_cv);
2916 2943                  mutex_exit(&zonehash_lock);
2917 2944          }
2918 2945  }
2919 2946  
2920 2947  void
2921 2948  zone_task_hold(zone_t *z)
2922 2949  {
2923 2950          mutex_enter(&z->zone_lock);
2924 2951          z->zone_ntasks++;
2925 2952          ASSERT(z->zone_ntasks != 0);
2926 2953          mutex_exit(&z->zone_lock);
2927 2954  }
2928 2955  
2929 2956  void
2930 2957  zone_task_rele(zone_t *zone)
2931 2958  {
2932 2959          uint_t refcnt;
2933 2960  
2934 2961          mutex_enter(&zone->zone_lock);
2935 2962          ASSERT(zone->zone_ntasks != 0);
2936 2963          refcnt = --zone->zone_ntasks;
2937 2964          if (refcnt > 1) {       /* Common case */
2938 2965                  mutex_exit(&zone->zone_lock);
2939 2966                  return;
2940 2967          }
2941 2968          zone_hold_locked(zone); /* so we can use the zone_t later */
2942 2969          mutex_exit(&zone->zone_lock);
2943 2970          if (refcnt == 1) {
2944 2971                  /*
2945 2972                   * See if the zone is shutting down.
2946 2973                   */
2947 2974                  mutex_enter(&zone_status_lock);
2948 2975                  if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2949 2976                          goto out;
2950 2977                  }
2951 2978  
2952 2979                  /*
2953 2980                   * Make sure the ntasks didn't change since we
2954 2981                   * dropped zone_lock.
2955 2982                   */
2956 2983                  mutex_enter(&zone->zone_lock);
2957 2984                  if (refcnt != zone->zone_ntasks) {
2958 2985                          mutex_exit(&zone->zone_lock);
2959 2986                          goto out;
2960 2987                  }
2961 2988                  mutex_exit(&zone->zone_lock);
2962 2989  
2963 2990                  /*
2964 2991                   * No more user processes in the zone.  The zone is empty.
2965 2992                   */
2966 2993                  zone_status_set(zone, ZONE_IS_EMPTY);
2967 2994                  goto out;
2968 2995          }
2969 2996  
2970 2997          ASSERT(refcnt == 0);
2971 2998          /*
2972 2999           * zsched has exited; the zone is dead.
2973 3000           */
2974 3001          zone->zone_zsched = NULL;               /* paranoia */
2975 3002          mutex_enter(&zone_status_lock);
2976 3003          zone_status_set(zone, ZONE_IS_DEAD);
2977 3004  out:
2978 3005          mutex_exit(&zone_status_lock);
2979 3006          zone_rele(zone);
2980 3007  }
2981 3008  
2982 3009  zoneid_t
2983 3010  getzoneid(void)
2984 3011  {
2985 3012          return (curproc->p_zone->zone_id);
2986 3013  }
2987 3014  
2988 3015  /*
2989 3016   * Internal versions of zone_find_by_*().  These don't zone_hold() or
2990 3017   * check the validity of a zone's state.
2991 3018   */
2992 3019  static zone_t *
2993 3020  zone_find_all_by_id(zoneid_t zoneid)
2994 3021  {
2995 3022          mod_hash_val_t hv;
2996 3023          zone_t *zone = NULL;
2997 3024  
2998 3025          ASSERT(MUTEX_HELD(&zonehash_lock));
2999 3026  
3000 3027          if (mod_hash_find(zonehashbyid,
3001 3028              (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3002 3029                  zone = (zone_t *)hv;
3003 3030          return (zone);
3004 3031  }
3005 3032  
3006 3033  static zone_t *
3007 3034  zone_find_all_by_label(const ts_label_t *label)
3008 3035  {
3009 3036          mod_hash_val_t hv;
3010 3037          zone_t *zone = NULL;
3011 3038  
3012 3039          ASSERT(MUTEX_HELD(&zonehash_lock));
3013 3040  
3014 3041          /*
3015 3042           * zonehashbylabel is not maintained for unlabeled systems
3016 3043           */
3017 3044          if (!is_system_labeled())
3018 3045                  return (NULL);
3019 3046          if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3020 3047                  zone = (zone_t *)hv;
3021 3048          return (zone);
3022 3049  }
3023 3050  
3024 3051  static zone_t *
3025 3052  zone_find_all_by_name(char *name)
3026 3053  {
3027 3054          mod_hash_val_t hv;
3028 3055          zone_t *zone = NULL;
3029 3056  
3030 3057          ASSERT(MUTEX_HELD(&zonehash_lock));
3031 3058  
3032 3059          if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3033 3060                  zone = (zone_t *)hv;
3034 3061          return (zone);
3035 3062  }
3036 3063  
3037 3064  /*
3038 3065   * Public interface for looking up a zone by zoneid.  Only returns the zone if
3039 3066   * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3040 3067   * Caller must call zone_rele() once it is done with the zone.
3041 3068   *
3042 3069   * The zone may begin the zone_destroy() sequence immediately after this
3043 3070   * function returns, but may be safely used until zone_rele() is called.
3044 3071   */
3045 3072  zone_t *
3046 3073  zone_find_by_id(zoneid_t zoneid)
3047 3074  {
3048 3075          zone_t *zone;
3049 3076          zone_status_t status;
3050 3077  
3051 3078          mutex_enter(&zonehash_lock);
3052 3079          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3053 3080                  mutex_exit(&zonehash_lock);
3054 3081                  return (NULL);
3055 3082          }
3056 3083          status = zone_status_get(zone);
3057 3084          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3058 3085                  /*
3059 3086                   * For all practical purposes the zone doesn't exist.
3060 3087                   */
3061 3088                  mutex_exit(&zonehash_lock);
3062 3089                  return (NULL);
3063 3090          }
3064 3091          zone_hold(zone);
3065 3092          mutex_exit(&zonehash_lock);
3066 3093          return (zone);
3067 3094  }
3068 3095  
3069 3096  /*
3070 3097   * Similar to zone_find_by_id, but using zone label as the key.
3071 3098   */
3072 3099  zone_t *
3073 3100  zone_find_by_label(const ts_label_t *label)
3074 3101  {
3075 3102          zone_t *zone;
3076 3103          zone_status_t status;
3077 3104  
3078 3105          mutex_enter(&zonehash_lock);
3079 3106          if ((zone = zone_find_all_by_label(label)) == NULL) {
3080 3107                  mutex_exit(&zonehash_lock);
3081 3108                  return (NULL);
3082 3109          }
3083 3110  
3084 3111          status = zone_status_get(zone);
3085 3112          if (status > ZONE_IS_DOWN) {
3086 3113                  /*
3087 3114                   * For all practical purposes the zone doesn't exist.
3088 3115                   */
3089 3116                  mutex_exit(&zonehash_lock);
3090 3117                  return (NULL);
3091 3118          }
3092 3119          zone_hold(zone);
3093 3120          mutex_exit(&zonehash_lock);
3094 3121          return (zone);
3095 3122  }
3096 3123  
3097 3124  /*
3098 3125   * Similar to zone_find_by_id, but using zone name as the key.
3099 3126   */
3100 3127  zone_t *
3101 3128  zone_find_by_name(char *name)
3102 3129  {
3103 3130          zone_t *zone;
3104 3131          zone_status_t status;
3105 3132  
3106 3133          mutex_enter(&zonehash_lock);
3107 3134          if ((zone = zone_find_all_by_name(name)) == NULL) {
3108 3135                  mutex_exit(&zonehash_lock);
3109 3136                  return (NULL);
3110 3137          }
3111 3138          status = zone_status_get(zone);
3112 3139          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3113 3140                  /*
3114 3141                   * For all practical purposes the zone doesn't exist.
3115 3142                   */
3116 3143                  mutex_exit(&zonehash_lock);
3117 3144                  return (NULL);
3118 3145          }
3119 3146          zone_hold(zone);
3120 3147          mutex_exit(&zonehash_lock);
3121 3148          return (zone);
3122 3149  }
3123 3150  
3124 3151  /*
3125 3152   * Similar to zone_find_by_id(), using the path as a key.  For instance,
3126 3153   * if there is a zone "foo" rooted at /foo/root, and the path argument
3127 3154   * is "/foo/root/proc", it will return the held zone_t corresponding to
3128 3155   * zone "foo".
3129 3156   *
3130 3157   * zone_find_by_path() always returns a non-NULL value, since at the
3131 3158   * very least every path will be contained in the global zone.
3132 3159   *
3133 3160   * As with the other zone_find_by_*() functions, the caller is
3134 3161   * responsible for zone_rele()ing the return value of this function.
3135 3162   */
3136 3163  zone_t *
3137 3164  zone_find_by_path(const char *path)
3138 3165  {
3139 3166          zone_t *zone;
3140 3167          zone_t *zret = NULL;
3141 3168          zone_status_t status;
3142 3169  
3143 3170          if (path == NULL) {
3144 3171                  /*
3145 3172                   * Call from rootconf().
3146 3173                   */
3147 3174                  zone_hold(global_zone);
3148 3175                  return (global_zone);
3149 3176          }
3150 3177          ASSERT(*path == '/');
3151 3178          mutex_enter(&zonehash_lock);
3152 3179          for (zone = list_head(&zone_active); zone != NULL;
3153 3180              zone = list_next(&zone_active, zone)) {
3154 3181                  if (ZONE_PATH_VISIBLE(path, zone))
3155 3182                          zret = zone;
3156 3183          }
3157 3184          ASSERT(zret != NULL);
3158 3185          status = zone_status_get(zret);
3159 3186          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3160 3187                  /*
3161 3188                   * Zone practically doesn't exist.
3162 3189                   */
3163 3190                  zret = global_zone;
3164 3191          }
3165 3192          zone_hold(zret);
3166 3193          mutex_exit(&zonehash_lock);
3167 3194          return (zret);
3168 3195  }
3169 3196  
3170 3197  /*
3171 3198   * Public interface for updating per-zone load averages.  Called once per
3172 3199   * second.
3173 3200   *
3174 3201   * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3175 3202   */
3176 3203  void
3177 3204  zone_loadavg_update()
3178 3205  {
3179 3206          zone_t *zp;
3180 3207          zone_status_t status;
3181 3208          struct loadavg_s *lavg;
3182 3209          hrtime_t zone_total;
3183 3210          int i;
3184 3211          hrtime_t hr_avg;
3185 3212          int nrun;
3186 3213          static int64_t f[3] = { 135, 27, 9 };
3187 3214          int64_t q, r;
3188 3215  
3189 3216          mutex_enter(&zonehash_lock);
3190 3217          for (zp = list_head(&zone_active); zp != NULL;
3191 3218              zp = list_next(&zone_active, zp)) {
3192 3219                  mutex_enter(&zp->zone_lock);
3193 3220  
3194 3221                  /* Skip zones that are on the way down or not yet up */
3195 3222                  status = zone_status_get(zp);
3196 3223                  if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3197 3224                          /* For all practical purposes the zone doesn't exist. */
3198 3225                          mutex_exit(&zp->zone_lock);
3199 3226                          continue;
3200 3227                  }
3201 3228  
3202 3229                  /*
3203 3230                   * Update the 10 second moving average data in zone_loadavg.
3204 3231                   */
3205 3232                  lavg = &zp->zone_loadavg;
3206 3233  
3207 3234                  zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3208 3235                  scalehrtime(&zone_total);
3209 3236  
3210 3237                  /* The zone_total should always be increasing. */
3211 3238                  lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3212 3239                      zone_total - lavg->lg_total : 0;
3213 3240                  lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3214 3241                  /* lg_total holds the prev. 1 sec. total */
3215 3242                  lavg->lg_total = zone_total;
3216 3243  
3217 3244                  /*
3218 3245                   * To simplify the calculation, we don't calculate the load avg.
3219 3246                   * until the zone has been up for at least 10 seconds and our
3220 3247                   * moving average is thus full.
3221 3248                   */
3222 3249                  if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3223 3250                          lavg->lg_len++;
3224 3251                          mutex_exit(&zp->zone_lock);
3225 3252                          continue;
3226 3253                  }
3227 3254  
3228 3255                  /* Now calculate the 1min, 5min, 15 min load avg. */
3229 3256                  hr_avg = 0;
3230 3257                  for (i = 0; i < S_LOADAVG_SZ; i++)
3231 3258                          hr_avg += lavg->lg_loads[i];
3232 3259                  hr_avg = hr_avg / S_LOADAVG_SZ;
3233 3260                  nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3234 3261  
3235 3262                  /* Compute load avg. See comment in calcloadavg() */
3236 3263                  for (i = 0; i < 3; i++) {
3237 3264                          q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3238 3265                          r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3239 3266                          zp->zone_hp_avenrun[i] +=
3240 3267                              ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3241 3268  
3242 3269                          /* avenrun[] can only hold 31 bits of load avg. */
3243 3270                          if (zp->zone_hp_avenrun[i] <
3244 3271                              ((uint64_t)1<<(31+16-FSHIFT)))
3245 3272                                  zp->zone_avenrun[i] = (int32_t)
3246 3273                                      (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3247 3274                          else
3248 3275                                  zp->zone_avenrun[i] = 0x7fffffff;
3249 3276                  }
3250 3277  
3251 3278                  mutex_exit(&zp->zone_lock);
3252 3279          }
3253 3280          mutex_exit(&zonehash_lock);
3254 3281  }
3255 3282  
3256 3283  /*
3257 3284   * Get the number of cpus visible to this zone.  The system-wide global
3258 3285   * 'ncpus' is returned if pools are disabled, the caller is in the
3259 3286   * global zone, or a NULL zone argument is passed in.
3260 3287   */
3261 3288  int
3262 3289  zone_ncpus_get(zone_t *zone)
3263 3290  {
3264 3291          int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3265 3292  
3266 3293          return (myncpus != 0 ? myncpus : ncpus);
3267 3294  }
3268 3295  
3269 3296  /*
3270 3297   * Get the number of online cpus visible to this zone.  The system-wide
3271 3298   * global 'ncpus_online' is returned if pools are disabled, the caller
3272 3299   * is in the global zone, or a NULL zone argument is passed in.
3273 3300   */
3274 3301  int
3275 3302  zone_ncpus_online_get(zone_t *zone)
3276 3303  {
3277 3304          int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3278 3305  
3279 3306          return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3280 3307  }
3281 3308  
3282 3309  /*
3283 3310   * Return the pool to which the zone is currently bound.
3284 3311   */
3285 3312  pool_t *
3286 3313  zone_pool_get(zone_t *zone)
3287 3314  {
3288 3315          ASSERT(pool_lock_held());
3289 3316  
3290 3317          return (zone->zone_pool);
3291 3318  }
3292 3319  
3293 3320  /*
3294 3321   * Set the zone's pool pointer and update the zone's visibility to match
3295 3322   * the resources in the new pool.
3296 3323   */
3297 3324  void
3298 3325  zone_pool_set(zone_t *zone, pool_t *pool)
3299 3326  {
3300 3327          ASSERT(pool_lock_held());
3301 3328          ASSERT(MUTEX_HELD(&cpu_lock));
3302 3329  
3303 3330          zone->zone_pool = pool;
3304 3331          zone_pset_set(zone, pool->pool_pset->pset_id);
3305 3332  }
3306 3333  
3307 3334  /*
3308 3335   * Return the cached value of the id of the processor set to which the
3309 3336   * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3310 3337   * facility is disabled.
3311 3338   */
3312 3339  psetid_t
3313 3340  zone_pset_get(zone_t *zone)
3314 3341  {
3315 3342          ASSERT(MUTEX_HELD(&cpu_lock));
3316 3343  
3317 3344          return (zone->zone_psetid);
3318 3345  }
3319 3346  
3320 3347  /*
3321 3348   * Set the cached value of the id of the processor set to which the zone
3322 3349   * is currently bound.  Also update the zone's visibility to match the
3323 3350   * resources in the new processor set.
3324 3351   */
3325 3352  void
3326 3353  zone_pset_set(zone_t *zone, psetid_t newpsetid)
3327 3354  {
3328 3355          psetid_t oldpsetid;
3329 3356  
3330 3357          ASSERT(MUTEX_HELD(&cpu_lock));
3331 3358          oldpsetid = zone_pset_get(zone);
3332 3359  
3333 3360          if (oldpsetid == newpsetid)
3334 3361                  return;
3335 3362          /*
3336 3363           * Global zone sees all.
3337 3364           */
3338 3365          if (zone != global_zone) {
3339 3366                  zone->zone_psetid = newpsetid;
3340 3367                  if (newpsetid != ZONE_PS_INVAL)
3341 3368                          pool_pset_visibility_add(newpsetid, zone);
3342 3369                  if (oldpsetid != ZONE_PS_INVAL)
3343 3370                          pool_pset_visibility_remove(oldpsetid, zone);
3344 3371          }
3345 3372          /*
3346 3373           * Disabling pools, so we should start using the global values
3347 3374           * for ncpus and ncpus_online.
3348 3375           */
3349 3376          if (newpsetid == ZONE_PS_INVAL) {
3350 3377                  zone->zone_ncpus = 0;
3351 3378                  zone->zone_ncpus_online = 0;
3352 3379          }
3353 3380  }
3354 3381  
3355 3382  /*
3356 3383   * Walk the list of active zones and issue the provided callback for
3357 3384   * each of them.
3358 3385   *
3359 3386   * Caller must not be holding any locks that may be acquired under
3360 3387   * zonehash_lock.  See comment at the beginning of the file for a list of
3361 3388   * common locks and their interactions with zones.
3362 3389   */
3363 3390  int
3364 3391  zone_walk(int (*cb)(zone_t *, void *), void *data)
3365 3392  {
3366 3393          zone_t *zone;
3367 3394          int ret = 0;
3368 3395          zone_status_t status;
3369 3396  
3370 3397          mutex_enter(&zonehash_lock);
3371 3398          for (zone = list_head(&zone_active); zone != NULL;
3372 3399              zone = list_next(&zone_active, zone)) {
3373 3400                  /*
3374 3401                   * Skip zones that shouldn't be externally visible.
3375 3402                   */
3376 3403                  status = zone_status_get(zone);
3377 3404                  if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3378 3405                          continue;
3379 3406                  /*
3380 3407                   * Bail immediately if any callback invocation returns a
3381 3408                   * non-zero value.
3382 3409                   */
3383 3410                  ret = (*cb)(zone, data);
3384 3411                  if (ret != 0)
3385 3412                          break;
3386 3413          }
3387 3414          mutex_exit(&zonehash_lock);
3388 3415          return (ret);
3389 3416  }
3390 3417  
3391 3418  static int
3392 3419  zone_set_root(zone_t *zone, const char *upath)
3393 3420  {
3394 3421          vnode_t *vp;
3395 3422          int trycount;
3396 3423          int error = 0;
3397 3424          char *path;
3398 3425          struct pathname upn, pn;
3399 3426          size_t pathlen;
3400 3427  
3401 3428          if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3402 3429                  return (error);
3403 3430  
3404 3431          pn_alloc(&pn);
3405 3432  
3406 3433          /* prevent infinite loop */
3407 3434          trycount = 10;
3408 3435          for (;;) {
3409 3436                  if (--trycount <= 0) {
3410 3437                          error = ESTALE;
3411 3438                          goto out;
3412 3439                  }
3413 3440  
3414 3441                  if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3415 3442                          /*
3416 3443                           * VOP_ACCESS() may cover 'vp' with a new
3417 3444                           * filesystem, if 'vp' is an autoFS vnode.
3418 3445                           * Get the new 'vp' if so.
3419 3446                           */
3420 3447                          if ((error =
3421 3448                              VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3422 3449                              (!vn_ismntpt(vp) ||
3423 3450                              (error = traverse(&vp)) == 0)) {
3424 3451                                  pathlen = pn.pn_pathlen + 2;
3425 3452                                  path = kmem_alloc(pathlen, KM_SLEEP);
3426 3453                                  (void) strncpy(path, pn.pn_path,
3427 3454                                      pn.pn_pathlen + 1);
3428 3455                                  path[pathlen - 2] = '/';
3429 3456                                  path[pathlen - 1] = '\0';
3430 3457                                  pn_free(&pn);
3431 3458                                  pn_free(&upn);
3432 3459  
3433 3460                                  /* Success! */
3434 3461                                  break;
3435 3462                          }
3436 3463                          VN_RELE(vp);
3437 3464                  }
3438 3465                  if (error != ESTALE)
3439 3466                          goto out;
3440 3467          }
3441 3468  
3442 3469          ASSERT(error == 0);
3443 3470          zone->zone_rootvp = vp;         /* we hold a reference to vp */
3444 3471          zone->zone_rootpath = path;
3445 3472          zone->zone_rootpathlen = pathlen;
3446 3473          if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3447 3474                  zone->zone_flags |= ZF_IS_SCRATCH;
3448 3475          return (0);
3449 3476  
3450 3477  out:
3451 3478          pn_free(&pn);
3452 3479          pn_free(&upn);
3453 3480          return (error);
3454 3481  }
3455 3482  
/*
 * ASCII-only alphanumeric test: non-zero for '0'-'9', 'a'-'z' and 'A'-'Z'.
 * The argument is evaluated more than once, so it must not have side
 * effects.
 */
#define	isalnum(c)	\
	(((c) >= '0' && (c) <= '9') || \
	((c) >= 'a' && (c) <= 'z') || \
	((c) >= 'A' && (c) <= 'Z'))
3459 3486  
3460 3487  static int
3461 3488  zone_set_name(zone_t *zone, const char *uname)
3462 3489  {
3463 3490          char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3464 3491          size_t len;
3465 3492          int i, err;
3466 3493  
3467 3494          if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3468 3495                  kmem_free(kname, ZONENAME_MAX);
3469 3496                  return (err);   /* EFAULT or ENAMETOOLONG */
3470 3497          }
3471 3498  
3472 3499          /* must be less than ZONENAME_MAX */
3473 3500          if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3474 3501                  kmem_free(kname, ZONENAME_MAX);
3475 3502                  return (EINVAL);
3476 3503          }
3477 3504  
3478 3505          /*
3479 3506           * Name must start with an alphanumeric and must contain only
3480 3507           * alphanumerics, '-', '_' and '.'.
3481 3508           */
3482 3509          if (!isalnum(kname[0])) {
3483 3510                  kmem_free(kname, ZONENAME_MAX);
3484 3511                  return (EINVAL);
3485 3512          }
3486 3513          for (i = 1; i < len - 1; i++) {
3487 3514                  if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3488 3515                      kname[i] != '.') {
3489 3516                          kmem_free(kname, ZONENAME_MAX);
3490 3517                          return (EINVAL);
3491 3518                  }
3492 3519          }
3493 3520  
3494 3521          zone->zone_name = kname;
3495 3522          return (0);
3496 3523  }
3497 3524  
3498 3525  /*
3499 3526   * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3500 3527   * is NULL or it points to a zone with no hostid emulation, then the machine's
3501 3528   * hostid (i.e., the global zone's hostid) is returned.  This function returns
3502 3529   * zero if neither the zone nor the host machine (global zone) have hostids.  It
3503 3530   * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3504 3531   * hostid and the machine's hostid is invalid.
3505 3532   */
3506 3533  uint32_t
3507 3534  zone_get_hostid(zone_t *zonep)
3508 3535  {
3509 3536          unsigned long machine_hostid;
3510 3537  
3511 3538          if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3512 3539                  if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3513 3540                          return (HW_INVALID_HOSTID);
3514 3541                  return ((uint32_t)machine_hostid);
3515 3542          }
3516 3543          return (zonep->zone_hostid);
3517 3544  }
3518 3545  
3519 3546  /*
3520 3547   * Similar to thread_create(), but makes sure the thread is in the appropriate
3521 3548   * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3522 3549   */
3523 3550  /*ARGSUSED*/
3524 3551  kthread_t *
3525 3552  zthread_create(
3526 3553      caddr_t stk,
3527 3554      size_t stksize,
3528 3555      void (*proc)(),
3529 3556      void *arg,
3530 3557      size_t len,
3531 3558      pri_t pri)
3532 3559  {
3533 3560          kthread_t *t;
3534 3561          zone_t *zone = curproc->p_zone;
3535 3562          proc_t *pp = zone->zone_zsched;
3536 3563  
3537 3564          zone_hold(zone);        /* Reference to be dropped when thread exits */
3538 3565  
3539 3566          /*
3540 3567           * No-one should be trying to create threads if the zone is shutting
3541 3568           * down and there aren't any kernel threads around.  See comment
3542 3569           * in zthread_exit().
3543 3570           */
3544 3571          ASSERT(!(zone->zone_kthreads == NULL &&
3545 3572              zone_status_get(zone) >= ZONE_IS_EMPTY));
3546 3573          /*
3547 3574           * Create a thread, but don't let it run until we've finished setting
3548 3575           * things up.
3549 3576           */
3550 3577          t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3551 3578          ASSERT(t->t_forw == NULL);
3552 3579          mutex_enter(&zone_status_lock);
3553 3580          if (zone->zone_kthreads == NULL) {
3554 3581                  t->t_forw = t->t_back = t;
3555 3582          } else {
3556 3583                  kthread_t *tx = zone->zone_kthreads;
3557 3584  
3558 3585                  t->t_forw = tx;
3559 3586                  t->t_back = tx->t_back;
3560 3587                  tx->t_back->t_forw = t;
3561 3588                  tx->t_back = t;
3562 3589          }
3563 3590          zone->zone_kthreads = t;
3564 3591          mutex_exit(&zone_status_lock);
3565 3592  
3566 3593          mutex_enter(&pp->p_lock);
3567 3594          t->t_proc_flag |= TP_ZTHREAD;
3568 3595          project_rele(t->t_proj);
3569 3596          t->t_proj = project_hold(pp->p_task->tk_proj);
3570 3597  
3571 3598          /*
3572 3599           * Setup complete, let it run.
3573 3600           */
3574 3601          thread_lock(t);
3575 3602          t->t_schedflag |= TS_ALLSTART;
3576 3603          setrun_locked(t);
3577 3604          thread_unlock(t);
3578 3605  
3579 3606          mutex_exit(&pp->p_lock);
3580 3607  
3581 3608          return (t);
3582 3609  }
3583 3610  
3584 3611  /*
3585 3612   * Similar to thread_exit().  Must be called by threads created via
3586 3613   * zthread_create().
3587 3614   */
3588 3615  void
3589 3616  zthread_exit(void)
3590 3617  {
3591 3618          kthread_t *t = curthread;
3592 3619          proc_t *pp = curproc;
3593 3620          zone_t *zone = pp->p_zone;
3594 3621  
3595 3622          mutex_enter(&zone_status_lock);
3596 3623  
3597 3624          /*
3598 3625           * Reparent to p0
3599 3626           */
3600 3627          kpreempt_disable();
3601 3628          mutex_enter(&pp->p_lock);
3602 3629          t->t_proc_flag &= ~TP_ZTHREAD;
3603 3630          t->t_procp = &p0;
3604 3631          hat_thread_exit(t);
3605 3632          mutex_exit(&pp->p_lock);
3606 3633          kpreempt_enable();
3607 3634  
3608 3635          if (t->t_back == t) {
3609 3636                  ASSERT(t->t_forw == t);
3610 3637                  /*
3611 3638                   * If the zone is empty, once the thread count
3612 3639                   * goes to zero no further kernel threads can be
3613 3640                   * created.  This is because if the creator is a process
3614 3641                   * in the zone, then it must have exited before the zone
3615 3642                   * state could be set to ZONE_IS_EMPTY.
3616 3643                   * Otherwise, if the creator is a kernel thread in the
3617 3644                   * zone, the thread count is non-zero.
3618 3645                   *
3619 3646                   * This really means that non-zone kernel threads should
3620 3647                   * not create zone kernel threads.
3621 3648                   */
3622 3649                  zone->zone_kthreads = NULL;
3623 3650                  if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3624 3651                          zone_status_set(zone, ZONE_IS_DOWN);
3625 3652                          /*
3626 3653                           * Remove any CPU caps on this zone.
3627 3654                           */
3628 3655                          cpucaps_zone_remove(zone);
3629 3656                  }
3630 3657          } else {
3631 3658                  t->t_forw->t_back = t->t_back;
3632 3659                  t->t_back->t_forw = t->t_forw;
3633 3660                  if (zone->zone_kthreads == t)
3634 3661                          zone->zone_kthreads = t->t_forw;
3635 3662          }
3636 3663          mutex_exit(&zone_status_lock);
3637 3664          zone_rele(zone);
3638 3665          thread_exit();
3639 3666          /* NOTREACHED */
3640 3667  }
3641 3668  
3642 3669  static void
3643 3670  zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3644 3671  {
                   /*
                    * Make *vpp (a directory vnode slot belonging to process pp) refer
                    * to vp.  The new vnode is held before the swap, the pointer is
                    * exchanged under pp->p_lock, and the previous vnode (if any) is
                    * released only after the lock has been dropped.
                    */
3645 3672          vnode_t *oldvp;
3646 3673  
3647 3674          /* we're going to hold a reference here to the directory */
3648 3675          VN_HOLD(vp);
3649 3676  
3650 3677          /* update abs cwd/root path see c2/audit.c */
3651 3678          if (AU_AUDITING())
3652 3679                  audit_chdirec(vp, vpp);
3653 3680  
3654 3681          mutex_enter(&pp->p_lock);
3655 3682          oldvp = *vpp;
3656 3683          *vpp = vp;
3657 3684          mutex_exit(&pp->p_lock);
3658 3685          if (oldvp != NULL)
3659 3686                  VN_RELE(oldvp);
3660 3687  }
3661 3688  
3662 3689  /*
3663 3690   * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
            *
            * Every pair in nvl must be a uint64 named "privilege", "limit" or
            * "action", and all three names must be present.  Returns 0 on success
            * and EINVAL for any other pair type or name, for a privilege other than
            * RCPRIV_PRIVILEGED, for an action other than RCTL_LOCAL_NOACTION or
            * RCTL_LOCAL_DENY, or when one of the three names is missing.  On
            * success the signal, recipient and firing-time fields of rv are reset
            * to fixed defaults.  On failure rv may have been partially written.
3664 3691   */
3665 3692  static int
3666 3693  nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3667 3694  {
3668 3695          nvpair_t *nvp = NULL;
3669 3696          boolean_t priv_set = B_FALSE;
3670 3697          boolean_t limit_set = B_FALSE;
3671 3698          boolean_t action_set = B_FALSE;
3672 3699  
3673 3700          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3674 3701                  const char *name;
3675 3702                  uint64_t ui64;
3676 3703  
3677 3704                  name = nvpair_name(nvp);
3678 3705                  if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3679 3706                          return (EINVAL);
                           /* The type was checked above, so the result is ignored. */
3680 3707                  (void) nvpair_value_uint64(nvp, &ui64);
3681 3708                  if (strcmp(name, "privilege") == 0) {
3682 3709                          /*
3683 3710                           * Currently only privileged values are allowed, but
3684 3711                           * this may change in the future.
3685 3712                           */
3686 3713                          if (ui64 != RCPRIV_PRIVILEGED)
3687 3714                                  return (EINVAL);
3688 3715                          rv->rcv_privilege = ui64;
3689 3716                          priv_set = B_TRUE;
3690 3717                  } else if (strcmp(name, "limit") == 0) {
3691 3718                          rv->rcv_value = ui64;
3692 3719                          limit_set = B_TRUE;
3693 3720                  } else if (strcmp(name, "action") == 0) {
3694 3721                          if (ui64 != RCTL_LOCAL_NOACTION &&
3695 3722                              ui64 != RCTL_LOCAL_DENY)
3696 3723                                  return (EINVAL);
3697 3724                          rv->rcv_flagaction = ui64;
3698 3725                          action_set = B_TRUE;
3699 3726                  } else {
3700 3727                          return (EINVAL);
3701 3728                  }
3702 3729          }
3703 3730  
                   /* All three of privilege, limit and action are mandatory. */
3704 3731          if (!(priv_set && limit_set && action_set))
3705 3732                  return (EINVAL);
3706 3733          rv->rcv_action_signal = 0;
3707 3734          rv->rcv_action_recipient = NULL;
3708 3735          rv->rcv_action_recip_pid = -1;
3709 3736          rv->rcv_firing_time = 0;
3710 3737  
3711 3738          return (0);
3712 3739  }
3713 3740  
3714 3741  /*
3715 3742   * Non-global zone version of start_init.
            *
            * Records the calling process as the zone's init, runs
            * start_init_common() and saves its result in zone_boot_err.  If that
            * failed, or the global zone is shutting down, a zone still in
            * ZONE_IS_BOOTING is moved to ZONE_IS_SHUTTING_DOWN and the process is
            * disposed of.  Otherwise a zone still in ZONE_IS_BOOTING becomes
            * ZONE_IS_RUNNING and the lwp is sent to userland via lwp_rtt().
3716 3743   */
3717 3744  void
3718 3745  zone_start_init(void)
3719 3746  {
3720 3747          proc_t *p = ttoproc(curthread);
3721 3748          zone_t *z = p->p_zone;
3722 3749  
3723 3750          ASSERT(!INGLOBALZONE(curproc));
3724 3751  
3725 3752          /*
3726 3753           * For all purposes (ZONE_ATTR_INITPID and restart_init),
3727 3754           * storing just the pid of init is sufficient.
3728 3755           */
3729 3756          z->zone_proc_initpid = p->p_pid;
3730 3757  
3731 3758          /*
3732 3759           * We maintain zone_boot_err so that we can return the cause of the
3733 3760           * failure back to the caller of the zone_boot syscall.
3734 3761           */
3735 3762          p->p_zone->zone_boot_err = start_init_common();
3736 3763  
3737 3764          /*
3738 3765           * We will prevent booting zones from becoming running zones if the
3739 3766           * global zone is shutting down.
3740 3767           */
3741 3768          mutex_enter(&zone_status_lock);
3742 3769          if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3743 3770              ZONE_IS_SHUTTING_DOWN) {
3744 3771                  /*
3745 3772                   * Make sure we are still in the booting state-- we could have
3746 3773                   * raced and already be shutting down, or even further along.
3747 3774                   */
3748 3775                  if (zone_status_get(z) == ZONE_IS_BOOTING) {
3749 3776                          zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3750 3777                  }
3751 3778                  mutex_exit(&zone_status_lock);
3752 3779                  /* It's gone bad, dispose of the process */
3753 3780                  if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
                                   /* proc_exit() did not complete; exit just this lwp. */
3754 3781                          mutex_enter(&p->p_lock);
3755 3782                          ASSERT(p->p_flag & SEXITLWPS);
3756 3783                          lwp_exit();
3757 3784                  }
3758 3785          } else {
3759 3786                  if (zone_status_get(z) == ZONE_IS_BOOTING)
3760 3787                          zone_status_set(z, ZONE_IS_RUNNING);
3761 3788                  mutex_exit(&zone_status_lock);
3762 3789                  /* cause the process to return to userland. */
3763 3790                  lwp_rtt();
3764 3791          }
3765 3792  }
3766 3793  
3767 3794  struct zsched_arg {
                   /* Argument bundle read by zsched() through its void *arg. */
3768 3795          zone_t *zone;           /* zone that zsched() sets up and serves */
3769 3796          nvlist_t *nvlist;       /* rctls that zsched() applies to the zone */
3770 3797  };
3771 3798  
3772 3799  /*
3773 3800   * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3774 3801   * anything to do with scheduling, but rather with the fact that
3775 3802   * per-zone kernel threads are parented to zsched, just like regular
3776 3803   * kernel threads are parented to sched (p0).
3777 3804   *
3778 3805   * zsched is also responsible for launching init for the zone.
            *
            * 'arg' points to a struct zsched_arg carrying the zone and the nvlist
            * of rctls to apply.  The function moves the calling process into the
            * zone, applies the rctls, walks the zone through the INITIALIZED and
            * READY states, launches init once the zone is BOOTING, then sleeps
            * until the zone is DYING.  It does not return; it ends in exit().
3779 3806   */
3780 3807  static void
3781 3808  zsched(void *arg)
3782 3809  {
3783 3810          struct zsched_arg *za = arg;
3784 3811          proc_t *pp = curproc;
3785 3812          proc_t *initp = proc_init;
3786 3813          zone_t *zone = za->zone;
3787 3814          cred_t *cr, *oldcred;
3788 3815          rctl_set_t *set;
3789 3816          rctl_alloc_gp_t *gp;
3790 3817          contract_t *ct = NULL;
3791 3818          task_t *tk, *oldtk;
3792 3819          rctl_entity_p_t e;
3793 3820          kproject_t *pj;
3794 3821  
3795 3822          nvlist_t *nvl = za->nvlist;
3796 3823          nvpair_t *nvp = NULL;
3797 3824  
3798 3825          bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3799 3826          bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3800 3827          PTOU(pp)->u_argc = 0;
3801 3828          PTOU(pp)->u_argv = NULL;
3802 3829          PTOU(pp)->u_envp = NULL;
3803 3830          closeall(P_FINFO(pp));
3804 3831  
3805 3832          /*
3806 3833           * We are this zone's "zsched" process.  As the zone isn't generally
3807 3834           * visible yet we don't need to grab any locks before initializing its
3808 3835           * zone_zsched pointer.
3809 3836           */
3810 3837          zone_hold(zone);  /* this hold is released by zone_destroy() */
3811 3838          zone->zone_zsched = pp;
3812 3839          mutex_enter(&pp->p_lock);
3813 3840          pp->p_zone = zone;
3814 3841          mutex_exit(&pp->p_lock);
3815 3842  
3816 3843          /*
3817 3844           * Disassociate process from its 'parent'; parent ourselves to init
3818 3845           * (pid 1) and change other values as needed.
3819 3846           */
3820 3847          sess_create();
3821 3848  
3822 3849          mutex_enter(&pidlock);
3823 3850          proc_detach(pp);
3824 3851          pp->p_ppid = 1;
3825 3852          pp->p_flag |= SZONETOP;
3826 3853          pp->p_ancpid = 1;
3827 3854          pp->p_parent = initp;
3828 3855          pp->p_psibling = NULL;
3829 3856          if (initp->p_child)
3830 3857                  initp->p_child->p_psibling = pp;
3831 3858          pp->p_sibling = initp->p_child;
3832 3859          initp->p_child = pp;
3833 3860  
3834 3861          /* Decrement what newproc() incremented. */
3835 3862          upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3836 3863          /*
3837 3864           * Our credentials are about to become kcred-like, so we don't care
3838 3865           * about the caller's ruid.
3839 3866           */
3840 3867          upcount_inc(crgetruid(kcred), zone->zone_id);
3841 3868          mutex_exit(&pidlock);
3842 3869  
3843 3870          /*
3844 3871           * getting out of global zone, so decrement lwp and process counts
3845 3872           */
3846 3873          pj = pp->p_task->tk_proj;
3847 3874          mutex_enter(&global_zone->zone_nlwps_lock);
3848 3875          pj->kpj_nlwps -= pp->p_lwpcnt;
3849 3876          global_zone->zone_nlwps -= pp->p_lwpcnt;
3850 3877          pj->kpj_nprocs--;
3851 3878          global_zone->zone_nprocs--;
3852 3879          mutex_exit(&global_zone->zone_nlwps_lock);
3853 3880  
3854 3881          /*
3855 3882           * Decrement locked memory counts on old zone and project.
3856 3883           */
3857 3884          mutex_enter(&global_zone->zone_mem_lock);
3858 3885          global_zone->zone_locked_mem -= pp->p_locked_mem;
3859 3886          pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3860 3887          mutex_exit(&global_zone->zone_mem_lock);
3861 3888  
3862 3889          /*
3863 3890           * Create and join a new task in project '0' of this zone.
3864 3891           *
3865 3892           * We don't need to call holdlwps() since we know we're the only lwp in
3866 3893           * this process.
3867 3894           *
3868 3895           * task_join() returns with p_lock held.
3869 3896           */
3870 3897          tk = task_create(0, zone);
3871 3898          mutex_enter(&cpu_lock);
3872 3899          oldtk = task_join(tk, 0);
3873 3900  
                   /* Re-read the project: it now comes from the newly joined task. */
3874 3901          pj = pp->p_task->tk_proj;
3875 3902  
3876 3903          mutex_enter(&zone->zone_mem_lock);
3877 3904          zone->zone_locked_mem += pp->p_locked_mem;
3878 3905          pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3879 3906          mutex_exit(&zone->zone_mem_lock);
3880 3907  
3881 3908          /*
3882 3909           * add lwp and process counts to zsched's zone, and increment
3883 3910           * project's task and process count due to the task created in
3884 3911           * the above task_create.
3885 3912           */
3886 3913          mutex_enter(&zone->zone_nlwps_lock);
3887 3914          pj->kpj_nlwps += pp->p_lwpcnt;
3888 3915          pj->kpj_ntasks += 1;
3889 3916          zone->zone_nlwps += pp->p_lwpcnt;
3890 3917          pj->kpj_nprocs++;
3891 3918          zone->zone_nprocs++;
3892 3919          mutex_exit(&zone->zone_nlwps_lock);
3893 3920  
3894 3921          mutex_exit(&curproc->p_lock);
3895 3922          mutex_exit(&cpu_lock);
3896 3923          task_rele(oldtk);
3897 3924  
3898 3925          /*
3899 3926           * The process was created by a process in the global zone, hence the
3900 3927           * credentials are wrong.  We might as well have kcred-ish credentials.
3901 3928           */
3902 3929          cr = zone->zone_kcred;
3903 3930          crhold(cr);
3904 3931          mutex_enter(&pp->p_crlock);
3905 3932          oldcred = pp->p_cred;
3906 3933          pp->p_cred = cr;
3907 3934          mutex_exit(&pp->p_crlock);
3908 3935          crfree(oldcred);
3909 3936  
3910 3937          /*
3911 3938           * Hold credentials again (for thread)
3912 3939           */
3913 3940          crhold(cr);
3914 3941  
3915 3942          /*
3916 3943           * p_lwpcnt can't change since this is a kernel process.
3917 3944           */
3918 3945          crset(pp, cr);
3919 3946  
3920 3947          /*
3921 3948           * Chroot
3922 3949           */
3923 3950          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3924 3951          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3925 3952  
3926 3953          /*
3927 3954           * Initialize zone's rctl set.
3928 3955           */
3929 3956          set = rctl_set_create();
3930 3957          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3931 3958          mutex_enter(&pp->p_lock);
3932 3959          e.rcep_p.zone = zone;
3933 3960          e.rcep_t = RCENTITY_ZONE;
3934 3961          zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3935 3962          mutex_exit(&pp->p_lock);
3936 3963          rctl_prealloc_destroy(gp);
3937 3964  
3938 3965          /*
3939 3966           * Apply the rctls passed in to zone_create().  This is basically a list
3940 3967           * assignment: all of the old values are removed and the new ones
3941 3968           * inserted.  That is, if an empty list is passed in, all values are
3942 3969           * removed.
3943 3970           */
3944 3971          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3945 3972                  rctl_dict_entry_t *rde;
3946 3973                  rctl_hndl_t hndl;
3947 3974                  char *name;
3948 3975                  nvlist_t **nvlarray;
3949 3976                  uint_t i, nelem;
3950 3977                  int error;      /* For ASSERT()s */
3951 3978  
3952 3979                  name = nvpair_name(nvp);
3953 3980                  hndl = rctl_hndl_lookup(name);
3954 3981                  ASSERT(hndl != -1);
3955 3982                  rde = rctl_dict_lookup_hndl(hndl);
3956 3983                  ASSERT(rde != NULL);
3957 3984  
                           /*
                            * Delete existing values one at a time, stopping when the
                            * first remaining value is the RCPRIV_SYSTEM one.
                            */
3958 3985                  for (; /* ever */; ) {
3959 3986                          rctl_val_t oval;
3960 3987  
3961 3988                          mutex_enter(&pp->p_lock);
3962 3989                          error = rctl_local_get(hndl, NULL, &oval, pp);
3963 3990                          mutex_exit(&pp->p_lock);
3964 3991                          ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3965 3992                          ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3966 3993                          if (oval.rcv_privilege == RCPRIV_SYSTEM)
3967 3994                                  break;
3968 3995                          mutex_enter(&pp->p_lock);
3969 3996                          error = rctl_local_delete(hndl, &oval, pp);
3970 3997                          mutex_exit(&pp->p_lock);
3971 3998                          ASSERT(error == 0);
3972 3999                  }
3973 4000                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3974 4001                  ASSERT(error == 0);
3975 4002                  for (i = 0; i < nelem; i++) {
3976 4003                          rctl_val_t *nvalp;
3977 4004  
3978 4005                          nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
3979 4006                          error = nvlist2rctlval(nvlarray[i], nvalp);
3980 4007                          ASSERT(error == 0);

↓ open down ↓

1441 lines elided

↑ open up ↑

3981 4008                          /*
3982 4009                           * rctl_local_insert can fail if the value being
3983 4010                           * inserted is a duplicate; this is OK.
3984 4011                           */
3985 4012                          mutex_enter(&pp->p_lock);
3986 4013                          if (rctl_local_insert(hndl, nvalp, pp) != 0)
3987 4014                                  kmem_cache_free(rctl_val_cache, nvalp);
3988 4015                          mutex_exit(&pp->p_lock);
3989 4016                  }
3990 4017          }
     4018 +
3991 4019          /*
3992 4020           * Tell the world that we're done setting up.
3993 4021           *
3994 4022           * At this point we want to set the zone status to ZONE_IS_INITIALIZED
3995 4023           * and atomically set the zone's processor set visibility.  Once
3996 4024           * we drop pool_lock() this zone will automatically get updated
3997 4025           * to reflect any future changes to the pools configuration.
3998 4026           *
3999 4027           * Note that after we drop the locks below (zonehash_lock in
4000 4028           * particular) other operations such as a zone_getattr call can

4001 4029           * now proceed and observe the zone. That is the reason for doing a
4002 4030           * state transition to the INITIALIZED state.
4003 4031           */
4004 4032          pool_lock();
4005 4033          mutex_enter(&cpu_lock);
4006 4034          mutex_enter(&zonehash_lock);
4007 4035          zone_uniqid(zone);
4008 4036          zone_zsd_configure(zone);
4009 4037          if (pool_state == POOL_ENABLED)
4010 4038                  zone_pset_set(zone, pool_default->pool_pset->pset_id);
4011 4039          mutex_enter(&zone_status_lock);
4012 4040          ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4013 4041          zone_status_set(zone, ZONE_IS_INITIALIZED);
4014 4042          mutex_exit(&zone_status_lock);
4015 4043          mutex_exit(&zonehash_lock);
4016 4044          mutex_exit(&cpu_lock);
4017 4045          pool_unlock();
4018 4046  
4019 4047          /* Now call the create callbacks for all keys */
4020 4048          zsd_apply_all_keys(zsd_apply_create, zone);
4021 4049  
4022 4050          /* The callbacks are complete. Mark ZONE_IS_READY */
4023 4051          mutex_enter(&zone_status_lock);
4024 4052          ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4025 4053          zone_status_set(zone, ZONE_IS_READY);
4026 4054          mutex_exit(&zone_status_lock);
4027 4055  
4028 4056          /*
4029 4057           * Once we see the zone transition to the ZONE_IS_BOOTING state,
4030 4058           * we launch init, and set the state to running.
4031 4059           */
4032 4060          zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4033 4061  
                   /* The wait may return with the zone in a state other than BOOTING. */
4034 4062          if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4035 4063                  id_t cid;
4036 4064  
4037 4065                  /*
4038 4066                   * Ok, this is a little complicated.  We need to grab the
4039 4067                   * zone's pool's scheduling class ID; note that by now, we
4040 4068                   * are already bound to a pool if we need to be (zoneadmd
4041 4069                   * will have done that to us while we're in the READY
4042 4070                   * state).  *But* the scheduling class for the zone's 'init'
4043 4071                   * must be explicitly passed to newproc, which doesn't
4044 4072                   * respect pool bindings.
4045 4073                   *
4046 4074                   * We hold the pool_lock across the call to newproc() to
4047 4075                   * close the obvious race: the pool's scheduling class
4048 4076                   * could change before we manage to create the LWP with
4049 4077                   * classid 'cid'.
4050 4078                   */
4051 4079                  pool_lock();
4052 4080                  if (zone->zone_defaultcid > 0)
4053 4081                          cid = zone->zone_defaultcid;
4054 4082                  else
4055 4083                          cid = pool_get_class(zone->zone_pool);
4056 4084                  if (cid == -1)
4057 4085                          cid = defaultcid;
4058 4086  
4059 4087                  /*
4060 4088                   * If this fails, zone_boot will ultimately fail.  The
4061 4089                   * state of the zone will be set to SHUTTING_DOWN-- userland
4062 4090                   * will have to tear down the zone, and fail, or try again.
4063 4091                   */
4064 4092                  if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4065 4093                      minclsyspri - 1, &ct, 0)) != 0) {
4066 4094                          mutex_enter(&zone_status_lock);
4067 4095                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4068 4096                          mutex_exit(&zone_status_lock);
4069 4097                  } else {
4070 4098                          zone->zone_boot_time = gethrestime_sec();
4071 4099                  }
4072 4100  
4073 4101                  pool_unlock();
4074 4102          }
4075 4103  
4076 4104          /*
4077 4105           * Wait for zone_destroy() to be called.  This is what we spend
4078 4106           * most of our life doing.
4079 4107           */
4080 4108          zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4081 4109  
4082 4110          if (ct)
4083 4111                  /*
4084 4112                   * At this point the process contract should be empty.
4085 4113                   * (Though if it isn't, it's not the end of the world.)
4086 4114                   */
4087 4115                  VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4088 4116  
4089 4117          /*
4090 4118           * Allow kcred to be freed when all referring processes
4091 4119           * (including this one) go away.  We can't just do this in
4092 4120           * zone_free because we need to wait for the zone_cred_ref to
4093 4121           * drop to 0 before calling zone_free, and the existence of
4094 4122           * zone_kcred will prevent that.  Thus, we call crfree here to
4095 4123           * balance the crdup in zone_create.  The crhold calls earlier
4096 4124           * in zsched will be dropped when the thread and process exit.
4097 4125           */
4098 4126          crfree(zone->zone_kcred);
4099 4127          zone->zone_kcred = NULL;
4100 4128  
4101 4129          exit(CLD_EXITED, 0);
4102 4130  }
4103 4131  
4104 4132  /*
4105 4133   * Helper function to determine if there are any submounts of the
4106 4134   * provided path.  Used to make sure the zone doesn't "inherit" any
4107 4135   * mounts from before it is created.
            *
            * Returns the number of entries on the vfs list whose mount point
            * string begins with 'rootpath'.  The caller must hold zonehash_lock.
4108 4136   */
4109 4137  static uint_t
4110 4138  zone_mount_count(const char *rootpath)
4111 4139  {
4112 4140          vfs_t *vfsp;
4113 4141          uint_t count = 0;
4114 4142          size_t rootpathlen = strlen(rootpath);
4115 4143  
4116 4144          /*
4117 4145           * Holding zonehash_lock prevents race conditions with
4118 4146           * vfs_list_add()/vfs_list_remove() since we serialize with
4119 4147           * zone_find_by_path().
4120 4148           */
4121 4149          ASSERT(MUTEX_HELD(&zonehash_lock));
4122 4150          /*
4123 4151           * The rootpath must end with a '/'
                    *
                    * NOTE(review): the index below is rootpathlen - 1, which wraps
                    * for an empty rootpath; presumably callers never pass one --
                    * confirm.
4124 4152           */
4125 4153          ASSERT(rootpath[rootpathlen - 1] == '/');
4126 4154  
4127 4155          /*
4128 4156           * This intentionally does not count the rootpath itself if that
4129 4157           * happens to be a mount point.
4130 4158           */
4131 4159          vfs_list_read_lock();
4132 4160          vfsp = rootvfs;
                   /* The vfs list is circular: walk until we are back at rootvfs. */
4133 4161          do {
4134 4162                  if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4135 4163                      rootpathlen) == 0)
4136 4164                          count++;
4137 4165                  vfsp = vfsp->vfs_next;
4138 4166          } while (vfsp != rootvfs);
4139 4167          vfs_list_unlock();
4140 4168          return (count);
4141 4169  }
4142 4170  
4143 4171  /*
4144 4172   * Helper function to make sure that a zone created on 'rootpath'
4145 4173   * wouldn't end up containing other zones' rootpaths.
            *
            * Returns B_TRUE if 'rootpath' is "//", or if it matches the rootpath of
            * any active non-global zone over the shorter of the two lengths (that
            * is, one path is a prefix of the other).  Returns B_FALSE otherwise.
            * The caller must hold zonehash_lock.
4146 4174   */
4147 4175  static boolean_t
4148 4176  zone_is_nested(const char *rootpath)
4149 4177  {
4150 4178          zone_t *zone;
4151 4179          size_t rootpathlen = strlen(rootpath);
4152 4180          size_t len;
4153 4181  
4154 4182          ASSERT(MUTEX_HELD(&zonehash_lock));
4155 4183  
4156 4184          /*
4157 4185           * zone_set_root() appended '/' and '\0' at the end of rootpath
                    *
                    * The test below therefore matches the string "//", which is
                    * presumably what a requested root of "/" becomes.
4158 4186           */
4159 4187          if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4160 4188              (rootpath[1] == '/') && (rootpath[2] == '\0'))
4161 4189                  return (B_TRUE);
4162 4190  
4163 4191          for (zone = list_head(&zone_active); zone != NULL;
4164 4192              zone = list_next(&zone_active, zone)) {
4165 4193                  if (zone == global_zone)
4166 4194                          continue;
4167 4195                  len = strlen(zone->zone_rootpath);
                           /* Comparing over the shorter length detects nesting either way. */
4168 4196                  if (strncmp(rootpath, zone->zone_rootpath,
4169 4197                      MIN(rootpathlen, len)) == 0)
4170 4198                          return (B_TRUE);
4171 4199          }
4172 4200          return (B_FALSE);
4173 4201  }
4174 4202  
4175 4203  static int
4176 4204  zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4177 4205      size_t zone_privssz)
4178 4206  {
                   /*
                    * Copy a priv_set_t in from the user address zone_privs and store
                    * the kernel copy in zone->zone_privset.  Returns 0 on success,
                    * ENOMEM if zone_privssz is smaller than sizeof (priv_set_t) (note
                    * that this is the error used for an undersized buffer, not for an
                    * allocation failure), or EFAULT if the copyin fails.  The
                    * allocation is freed here only on the EFAULT path.
                    */
4179 4207          priv_set_t *privs;
4180 4208  
4181 4209          if (zone_privssz < sizeof (priv_set_t))
4182 4210                  return (ENOMEM);
4183 4211  
4184 4212          privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4185 4213  
4186 4214          if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4187 4215                  kmem_free(privs, sizeof (priv_set_t));
4188 4216                  return (EFAULT);
4189 4217          }
4190 4218  
4191 4219          zone->zone_privset = privs;
4192 4220          return (0);
4193 4221  }
4194 4222  
4195 4223  /*
4196 4224   * We make creative use of nvlists to pass in rctls from userland.  The list is
4197 4225   * a list of the following structures:
4198 4226   *
4199 4227   * (name = rctl_name, value = nvpair_list_array)
4200 4228   *
4201 4229   * Where each element of the nvpair_list_array is of the form:
4202 4230   *
4203 4231   * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4204 4232   *      (name = "limit", value = uint64_t),
4205 4233   *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
            *
            * Copies buflen bytes in from ubuf, unpacks them into an nvlist and
            * validates every entry.  On success returns 0 and stores the nvlist in
            * *nvlp (left NULL when buflen is 0).  On failure returns ENOMEM, EFAULT
            * or EINVAL, frees any unpacked nvlist and leaves *nvlp NULL.
4206 4234   */
4207 4235  static int
4208 4236  parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4209 4237  {
4210 4238          nvpair_t *nvp = NULL;
4211 4239          nvlist_t *nvl = NULL;
4212 4240          char *kbuf;
4213 4241          int error;
4214 4242          rctl_val_t rv;
4215 4243  
4216 4244          *nvlp = NULL;
4217 4245  
4218 4246          if (buflen == 0)
4219 4247                  return (0);
4220 4248  
4221 4249          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4222 4250                  return (ENOMEM);
4223 4251          if (copyin(ubuf, kbuf, buflen)) {
4224 4252                  error = EFAULT;
4225 4253                  goto out;
4226 4254          }
4227 4255          if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4228 4256                  /*
4229 4257                   * nvl may have been allocated/free'd, but the value set to
4230 4258                   * non-NULL, so we reset it here.
4231 4259                   */
4232 4260                  nvl = NULL;
4233 4261                  error = EINVAL;
4234 4262                  goto out;
4235 4263          }
4236 4264          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4237 4265                  rctl_dict_entry_t *rde;
4238 4266                  rctl_hndl_t hndl;
4239 4267                  nvlist_t **nvlarray;
4240 4268                  uint_t i, nelem;
4241 4269                  char *name;
4242 4270  
                           /* Preset so the bare "goto out" paths below report EINVAL. */
4243 4271                  error = EINVAL;
4244 4272                  name = nvpair_name(nvp);
                           /* Only rctls named "zone.*" with an nvlist-array value pass. */
4245 4273                  if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4246 4274                      != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4247 4275                          goto out;
4248 4276                  }
4249 4277                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
4250 4278                          goto out;
4251 4279                  }
4252 4280                  rde = rctl_dict_lookup_hndl(hndl);
4253 4281                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4254 4282                  ASSERT(error == 0);
4255 4283                  for (i = 0; i < nelem; i++) {
4256 4284                          if (error = nvlist2rctlval(nvlarray[i], &rv))
4257 4285                                  goto out;
4258 4286                  }
                           /*
                            * NOTE(review): this check sits after the loop, so only the
                            * last parsed value is passed to rctl_invalid_value(), and
                            * 'rv' has not been written at all when nelem is 0 on the
                            * first pair.  Confirm whether the check was meant to run
                            * once per array element.
                            */
4259 4287                  if (rctl_invalid_value(rde, &rv)) {
4260 4288                          error = EINVAL;
4261 4289                          goto out;
4262 4290                  }
4263 4291          }

↓ open down ↓

263 lines elided

↑ open up ↑

4264 4292          error = 0;
4265 4293          *nvlp = nvl;
4266 4294  out:
                   /* The staging buffer is freed on success and on failure. */
4267 4295          kmem_free(kbuf, buflen);
4268 4296          if (error && nvl != NULL)
4269 4297                  nvlist_free(nvl);
4270 4298          return (error);
4271 4299  }
4272 4300  
4273 4301  int
4274      -zone_create_error(int er_error, int er_ext, int *er_out) {
     4302 +zone_create_error(int er_error, int er_ext, int *er_out)
     4303 +{
                   /*
                    * Report a failure with an optional extended error code.  If er_out
                    * is non-NULL, er_ext is copied out to it first, and a failed
                    * copyout yields set_errno(EFAULT) instead of the requested error.
                    * Otherwise the result is set_errno(er_error).
                    */
4275 4304          if (er_out != NULL) {
4276 4305                  if (copyout(&er_ext, er_out, sizeof (int))) {
4277 4306                          return (set_errno(EFAULT));
4278 4307                  }
4279 4308          }
4280 4309          return (set_errno(er_error));
4281 4310  }
4282 4311  
4283 4312  static int
4284 4313  zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)

4285 4314  {
                   /*
                    * Copy a bslabel_t in from the user address 'lab', allocate a
                    * ts_label_t for it with the given doi, and store the result in
                    * zone->zone_slabel.  Returns 0 on success, EFAULT if the copyin
                    * fails, or ENOMEM if the non-sleeping allocation fails.
                    */
4286 4315          ts_label_t *tsl;
4287 4316          bslabel_t blab;
4288 4317  
4289 4318          /* Get label from user */
4290 4319          if (copyin(lab, &blab, sizeof (blab)) != 0)
4291 4320                  return (EFAULT);
4292 4321          tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4293 4322          if (tsl == NULL)
4294 4323                  return (ENOMEM);
4295 4324  
4296 4325          zone->zone_slabel = tsl;
4297 4326          return (0);
4298 4327  }
4299 4328  
4300 4329  /*
4301 4330   * Parses a comma-separated list of ZFS datasets into a per-zone list.
            *
            * Copies buflen bytes in from ubuf and inserts one zone_dataset_t per
            * comma-separated name at the head of zone->zone_datasets.  Returns 0 on
            * success (including when ubuf is NULL or buflen is 0), ENOMEM if the
            * staging buffer cannot be allocated, or EFAULT if the copyin fails.
4302 4331   */
4303 4332  static int
4304 4333  parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4305 4334  {
4306 4335          char *kbuf;
4307 4336          char *dataset, *next;
4308 4337          zone_dataset_t *zd;
4309 4338          size_t len;
4310 4339  
4311 4340          if (ubuf == NULL || buflen == 0)
4312 4341                  return (0);
4313 4342  
4314 4343          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4315 4344                  return (ENOMEM);
4316 4345  
4317 4346          if (copyin(ubuf, kbuf, buflen) != 0) {
4318 4347                  kmem_free(kbuf, buflen);
4319 4348                  return (EFAULT);
4320 4349          }
4321 4350  
                   /*
                    * NOTE(review): strchr() and strlen() below treat kbuf as a
                    * NUL-terminated string, but nothing visible here checks that the
                    * copied-in bytes contain a terminator within buflen.  Confirm that
                    * callers guarantee it.
                    */
4322 4351          dataset = next = kbuf;
4323 4352          for (;;) {
4324 4353                  zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4325 4354  
4326 4355                  next = strchr(dataset, ',');
4327 4356  
                           /* The last name runs to the end of the string. */
4328 4357                  if (next == NULL)
4329 4358                          len = strlen(dataset);
4330 4359                  else
4331 4360                          len = next - dataset;
4332 4361  
4333 4362                  zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4334 4363                  bcopy(dataset, zd->zd_dataset, len);
4335 4364                  zd->zd_dataset[len] = '\0';
4336 4365  
4337 4366                  list_insert_head(&zone->zone_datasets, zd);
4338 4367  
4339 4368                  if (next == NULL)
4340 4369                          break;
4341 4370  
4342 4371                  dataset = next + 1;
4343 4372          }
4344 4373  
4345 4374          kmem_free(kbuf, buflen);
4346 4375          return (0);
4347 4376  }
4348 4377  
4349 4378  /*
4350 4379   * System call to create/initialize a new zone named 'zone_name', rooted
4351 4380   * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4352 4381   * and initialized with the zone-wide rctls described in 'rctlbuf', and
4353 4382   * with labeling set by 'match', 'doi', and 'label'.
4354 4383   *
4355 4384   * If extended error is non-null, we may use it to return more detailed
4356 4385   * error information.
            *
            * 'zfsbuf' is a comma-separated list of ZFS datasets (see parse_zfs()),
            * and ZCF_NET_EXCL in 'flags' marks the zone ZF_NET_EXCL.
            *
            * On success the id of the new zone is returned, after waiting for the
            * zone to reach ZONE_IS_READY.  On failure everything allocated here,
            * including the parsed rctl nvlist, is released and the errno is set
            * through zone_create_error() or set_errno().
4357 4386   */
4358 4387  static zoneid_t
4359 4388  zone_create(const char *zone_name, const char *zone_root,
4360 4389      const priv_set_t *zone_privs, size_t zone_privssz,
4361 4390      caddr_t rctlbuf, size_t rctlbufsz,
4362 4391      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4363 4392      int match, uint32_t doi, const bslabel_t *label,
4364 4393      int flags)
4365 4394  {
4366 4395          struct zsched_arg zarg;
4367 4396          nvlist_t *rctls = NULL;
4368 4397          proc_t *pp = curproc;
4369 4398          zone_t *zone, *ztmp;
4370 4399          zoneid_t zoneid;
4371 4400          int error;
4372 4401          int error2 = 0;
4373 4402          char *str;
4374 4403          cred_t *zkcr;
4375 4404          boolean_t insert_label_hash;
4376 4405  
4377 4406          if (secpolicy_zone_config(CRED()) != 0)
4378 4407                  return (set_errno(EPERM));
4379 4408  
4380 4409          /* can't boot zone from within chroot environment */
4381 4410          if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4382 4411                  return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4383 4412                      extended_error));
4384 4413  
4385 4414          zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4386 4415          zoneid = zone->zone_id = id_alloc(zoneid_space);
4387 4416          zone->zone_status = ZONE_IS_UNINITIALIZED;
4388 4417          zone->zone_pool = pool_default;
4389 4418          zone->zone_pool_mod = gethrtime();
4390 4419          zone->zone_psetid = ZONE_PS_INVAL;
4391 4420          zone->zone_ncpus = 0;
4392 4421          zone->zone_ncpus_online = 0;
4393 4422          zone->zone_restart_init = B_TRUE;
4394 4423          zone->zone_brand = &native_brand;
4395 4424          zone->zone_initname = NULL;
4396 4425          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4397 4426          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4398 4427          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4399 4428          cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4400 4429          list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4401 4430              offsetof(zone_ref_t, zref_linkage));
4402 4431          list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4403 4432              offsetof(struct zsd_entry, zsd_linkage));
4404 4433          list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4405 4434              offsetof(zone_dataset_t, zd_linkage));
4406 4435          list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4407 4436              offsetof(zone_dl_t, zdl_linkage));
4408 4437          rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4409 4438          rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4410 4439  
4411 4440          if (flags & ZCF_NET_EXCL) {
4412 4441                  zone->zone_flags |= ZF_NET_EXCL;
4413 4442          }
4414 4443  
4415 4444          if ((error = zone_set_name(zone, zone_name)) != 0) {
4416 4445                  zone_free(zone);
4417 4446                  return (zone_create_error(error, 0, extended_error));
4418 4447          }
4419 4448  
4420 4449          if ((error = zone_set_root(zone, zone_root)) != 0) {
4421 4450                  zone_free(zone);
4422 4451                  return (zone_create_error(error, 0, extended_error));
4423 4452          }
4424 4453          if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4425 4454                  zone_free(zone);
4426 4455                  return (zone_create_error(error, 0, extended_error));
4427 4456          }
4428 4457  
4429 4458          /* initialize node name to be the same as zone name */
4430 4459          zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4431 4460          (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4432 4461          zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4433 4462
4434 4463          zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4435 4464          zone->zone_domain[0] = '\0';
4436 4465          zone->zone_hostid = HW_INVALID_HOSTID;
4437 4466          zone->zone_shares = 1;
4438 4467          zone->zone_shmmax = 0;
4439 4468          zone->zone_ipc.ipcq_shmmni = 0;
4440 4469          zone->zone_ipc.ipcq_semmni = 0;
4441 4470          zone->zone_ipc.ipcq_msgmni = 0;
4442 4471          zone->zone_bootargs = NULL;
4443 4472          zone->zone_fs_allowed = NULL;
     4473 +
                   /*
                    * Start the new zone with empty lower, effective and inherit
                    * security-flag sets and a full upper set.  These belong to the
                    * zone being created; writing them to zone0 would reset the flags
                    * of a zone_t that already exists and leave this one untouched.
                    */
     4474 +        secflags_zero(&zone->zone_secflags.psf_lower);
     4475 +        secflags_zero(&zone->zone_secflags.psf_effective);
     4476 +        secflags_zero(&zone->zone_secflags.psf_inherit);
     4477 +        secflags_fullset(&zone->zone_secflags.psf_upper);
     4478 +
4444 4479          zone->zone_initname =
4445 4480              kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4446 4481          (void) strcpy(zone->zone_initname, zone_default_initname);
4447 4482          zone->zone_nlwps = 0;
4448 4483          zone->zone_nlwps_ctl = INT_MAX;
4449 4484          zone->zone_nprocs = 0;
4450 4485          zone->zone_nprocs_ctl = INT_MAX;
4451 4486          zone->zone_locked_mem = 0;
4452 4487          zone->zone_locked_mem_ctl = UINT64_MAX;
4453 4488          zone->zone_max_swap = 0;
4454 4489          zone->zone_max_swap_ctl = UINT64_MAX;
4455 4490          zone->zone_max_lofi = 0;
4456 4491          zone->zone_max_lofi_ctl = UINT64_MAX;
                   /*
                    * The zone_t came from kmem_zalloc(), so these are already NULL;
                    * the stores are kept for explicitness but must target the new
                    * zone rather than zone0.
                    */
4457 4492          zone->zone_lockedmem_kstat = NULL;
4458 4493          zone->zone_swapresv_kstat = NULL;
4459 4494  
4460 4495          /*
4461 4496           * Zsched initializes the rctls.
4462 4497           */
4463 4498          zone->zone_rctls = NULL;
4464 4499  
4465 4500          if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4466 4501                  zone_free(zone);
4467 4502                  return (zone_create_error(error, 0, extended_error));
4468 4503          }
4469 4504  
                   /*
                    * From here on 'rctls' may hold a parsed nvlist, so every failure
                    * path has to release it along with the zone.
                    */
4470 4505          if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4471 4506                  zone_free(zone);
                           nvlist_free(rctls);
4472 4507                  return (set_errno(error));
4473 4508          }
4474 4509  
4475 4510          /*
4476 4511           * Read in the trusted system parameters:
4477 4512           * match flag and sensitivity label.
4478 4513           */
4479 4514          zone->zone_match = match;
4480 4515          if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4481 4516                  /* Fail if requested to set doi to anything but system's doi */
4482 4517                  if (doi != 0 && doi != default_doi) {
4483 4518                          zone_free(zone);
                                   nvlist_free(rctls);
4484 4519                          return (set_errno(EINVAL));
4485 4520                  }
4486 4521                  /* Always apply system's doi to the zone */
4487 4522                  error = zone_set_label(zone, label, default_doi);
4488 4523                  if (error != 0) {
4489 4524                          zone_free(zone);
                                   nvlist_free(rctls);
4490 4525                          return (set_errno(error));
4491 4526                  }
4492 4527                  insert_label_hash = B_TRUE;
4493 4528          } else {
4494 4529                  /* all zones get an admin_low label if system is not labeled */
4495 4530                  zone->zone_slabel = l_admin_low;
4496 4531                  label_hold(l_admin_low);
4497 4532                  insert_label_hash = B_FALSE;
4498 4533          }
4499 4534  
4500 4535          /*
4501 4536           * Stop all lwps since that's what normally happens as part of fork().
4502 4537           * This needs to happen before we grab any locks to avoid deadlock
4503 4538           * (another lwp in the process could be waiting for the held lock).
4504 4539           */
4505 4540          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4506 4541                  zone_free(zone);
4507 4542                  nvlist_free(rctls);
                           /*
                            * 'error' is 0 at this point (every earlier failure has
                            * already returned), so the errno must be named here.
                            * NOTE(review): EINTR assumes holdlwps() fails because
                            * the hold was interrupted -- confirm against holdlwps().
                            */
4508 4543                  return (zone_create_error(EINTR, 0, extended_error));
4509 4544          }
4510 4545  
4511 4546          if (block_mounts(zone) == 0) {
4512 4547                  mutex_enter(&pp->p_lock);
4513 4548                  if (curthread != pp->p_agenttp)
4514 4549                          continuelwps(pp);
4515 4550                  mutex_exit(&pp->p_lock);
4516 4551                  zone_free(zone);
4517 4552                  nvlist_free(rctls);
                           /*
                            * As above 'error' is still 0; zone_shutdown() reports
                            * EINTR for the same block_mounts() result.
                            */
4518 4553                  return (zone_create_error(EINTR, 0, extended_error));
4519 4554          }
4520 4555  
4521 4556          /*
4522 4557           * Set up credential for kernel access.  After this, any errors
4523 4558           * should go through the dance in errout rather than calling
4524 4559           * zone_free directly.
4525 4560           */
4526 4561          zone->zone_kcred = crdup(kcred);
4527 4562          crsetzone(zone->zone_kcred, zone);
4528 4563          priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4529 4564          priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4530 4565          priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4531 4566          priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4532 4567  
4533 4568          mutex_enter(&zonehash_lock);
4534 4569          /*
4535 4570           * Make sure zone doesn't already exist.
4536 4571           *
4537 4572           * If the system and zone are labeled,
4538 4573           * make sure no other zone exists that has the same label.
4539 4574           */
4540 4575          if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4541 4576              (insert_label_hash &&
4542 4577              (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4543 4578                  zone_status_t status;
4544 4579  
4545 4580                  status = zone_status_get(ztmp);
4546 4581                  if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4547 4582                          error = EEXIST;
4548 4583                  else
4549 4584                          error = EBUSY;
4550 4585  
4551 4586                  if (insert_label_hash)
4552 4587                          error2 = ZE_LABELINUSE;
4553 4588  
4554 4589                  goto errout;
4555 4590          }
4556 4591  
4557 4592          /*
4558 4593           * Don't allow zone creations which would cause one zone's rootpath to
4559 4594           * be accessible from that of another (non-global) zone.
4560 4595           */
4561 4596          if (zone_is_nested(zone->zone_rootpath)) {
4562 4597                  error = EBUSY;
4563 4598                  goto errout;
4564 4599          }
4565 4600  
4566 4601          ASSERT(zonecount != 0);         /* check for leaks */
4567 4602          if (zonecount + 1 > maxzones) {
4568 4603                  error = ENOMEM;
4569 4604                  goto errout;
4570 4605          }
4571 4606  
4572 4607          if (zone_mount_count(zone->zone_rootpath) != 0) {
4573 4608                  error = EBUSY;
4574 4609                  error2 = ZE_AREMOUNTS;
4575 4610                  goto errout;
4576 4611          }
4577 4612  
4578 4613          /*
4579 4614           * Zone is still incomplete, but we need to drop all locks while
4580 4615           * zsched() initializes this zone's kernel process.  We
4581 4616           * optimistically add the zone to the hashtable and associated
4582 4617           * lists so a parallel zone_create() doesn't try to create the
4583 4618           * same zone.
4584 4619           */
4585 4620          zonecount++;
4586 4621          (void) mod_hash_insert(zonehashbyid,
4587 4622              (mod_hash_key_t)(uintptr_t)zone->zone_id,
4588 4623              (mod_hash_val_t)(uintptr_t)zone);
4589 4624          str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4590 4625          (void) strcpy(str, zone->zone_name);
4591 4626          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4592 4627              (mod_hash_val_t)(uintptr_t)zone);
4593 4628          if (insert_label_hash) {
4594 4629                  (void) mod_hash_insert(zonehashbylabel,
4595 4630                      (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4596 4631                  zone->zone_flags |= ZF_HASHED_LABEL;
4597 4632          }
4598 4633  
4599 4634          /*
4600 4635           * Insert into active list.  At this point there are no 'hold's
4601 4636           * on the zone, but everyone else knows not to use it, so we can
4602 4637           * continue to use it.  zsched() will do a zone_hold() if the
4603 4638           * newproc() is successful.
4604 4639           */
4605 4640          list_insert_tail(&zone_active, zone);
4606 4641          mutex_exit(&zonehash_lock);
4607 4642  
4608 4643          zarg.zone = zone;
4609 4644          zarg.nvlist = rctls;
4610 4645          /*
4611 4646           * The process, task, and project rctls are probably wrong;
4612 4647           * we need an interface to get the default values of all rctls,
4613 4648           * and initialize zsched appropriately.  I'm not sure that that
4614 4649           * makes much of a difference, though.
4615 4650           */
4616 4651          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4617 4652          if (error != 0) {
4618 4653                  /*
4619 4654                   * We need to undo all globally visible state.
4620 4655                   */
                           /* errout drops zonehash_lock, so retake it before jumping. */
4621 4656                  mutex_enter(&zonehash_lock);
4622 4657                  list_remove(&zone_active, zone);
4623 4658                  if (zone->zone_flags & ZF_HASHED_LABEL) {
4624 4659                          ASSERT(zone->zone_slabel != NULL);
4625 4660                          (void) mod_hash_destroy(zonehashbylabel,
4626 4661                              (mod_hash_key_t)zone->zone_slabel);
4627 4662                  }
4628 4663                  (void) mod_hash_destroy(zonehashbyname,
4629 4664                      (mod_hash_key_t)(uintptr_t)zone->zone_name);
4630 4665                  (void) mod_hash_destroy(zonehashbyid,
4631 4666                      (mod_hash_key_t)(uintptr_t)zone->zone_id);
4632 4667                  ASSERT(zonecount > 1);
4633 4668                  zonecount--;
4634 4669                  goto errout;
4635 4670          }
4636 4671  
4637 4672          /*
4638 4673           * Zone creation can't fail from now on.
4639 4674           */
4640 4675  
4641 4676          /*
4642 4677           * Create zone kstats
4643 4678           */
4644 4679          zone_kstat_create(zone);
4645 4680  
4646 4681          /*
4647 4682           * Let the other lwps continue.
4648 4683           */
4649 4684          mutex_enter(&pp->p_lock);
4650 4685          if (curthread != pp->p_agenttp)
4651 4686                  continuelwps(pp);
4652 4687          mutex_exit(&pp->p_lock);
4653 4688  
4654 4689          /*
4655 4690           * Wait for zsched to finish initializing the zone.
4656 4691           */
4657 4692          zone_status_wait(zone, ZONE_IS_READY);
4658 4693          /*
4659 4694           * The zone is fully visible, so we can let mounts progress.
4660 4695           */
4661 4696          resume_mounts(zone);
4662 4697          nvlist_free(rctls);
4663 4698  
4664 4699          return (zoneid);
4665 4700  
                   /* Entered with zonehash_lock held, lwps held and mounts blocked. */
4666 4701  errout:
4667 4702          mutex_exit(&zonehash_lock);
4668 4703          /*
4669 4704           * Let the other lwps continue.
4670 4705           */
4671 4706          mutex_enter(&pp->p_lock);
4672 4707          if (curthread != pp->p_agenttp)
4673 4708                  continuelwps(pp);
4674 4709          mutex_exit(&pp->p_lock);
4675 4710  
4676 4711          resume_mounts(zone);
4677 4712          nvlist_free(rctls);
4678 4713          /*
4679 4714           * There is currently one reference to the zone, a cred_ref from
4680 4715           * zone_kcred.  To free the zone, we call crfree, which will call
4681 4716           * zone_cred_rele, which will call zone_free.
4682 4717           */
4683 4718          ASSERT(zone->zone_cred_ref == 1);
4684 4719          ASSERT(zone->zone_kcred->cr_ref == 1);
4685 4720          ASSERT(zone->zone_ref == 0);
4686 4721          zkcr = zone->zone_kcred;
4687 4722          zone->zone_kcred = NULL;
4688 4723          crfree(zkcr);                           /* triggers call to zone_free */
4689 4724          return (zone_create_error(error, error2, extended_error));
4690 4725  }
4691 4726  
4692 4727  /*
4693 4728   * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4694 4729   * the heavy lifting.  initname is the path to the program to launch
4695 4730   * at the "top" of the zone; if this is NULL, we use the system default,
4696 4731   * which is stored at zone_default_initname.
            *
            * NOTE(review): this function takes no initname argument; the sentence
            * above does not match the current signature.
            *
            * What the code here does is move the zone identified by 'zoneid' from
            * ZONE_IS_READY to ZONE_IS_BOOTING and then wait, interruptibly, for it
            * to reach ZONE_IS_RUNNING.
            *
            * Errors are reported through set_errno():
            *   EPERM   secpolicy_zone_config() refused the caller.
            *   EINVAL  zoneid is outside [MIN_USERZONEID, MAX_ZONEID], no such zone
            *           was found, or the zone is not in ZONE_IS_READY.
            *   EINTR   zone_status_wait_sig() returned 0.
            *   other   a non-zero zone->zone_boot_err.
            * Returns 0 when zone_boot_err is 0.
4697 4732   */
4698 4733  static int
4699 4734  zone_boot(zoneid_t zoneid)
4700 4735  {
4701 4736          int err;
4702 4737          zone_t *zone;
4703 4738  
4704 4739          if (secpolicy_zone_config(CRED()) != 0)
4705 4740                  return (set_errno(EPERM));
4706 4741          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4707 4742                  return (set_errno(EINVAL));
4708 4743  
4709 4744          mutex_enter(&zonehash_lock);
4710 4745          /*
4711 4746           * Look for zone under hash lock to prevent races with calls to
4712 4747           * zone_shutdown, zone_destroy, etc.
4713 4748           */
4714 4749          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4715 4750                  mutex_exit(&zonehash_lock);
4716 4751                  return (set_errno(EINVAL));
4717 4752          }
4718 4753  
                   /* The status check and the transition share one lock hold. */
4719 4754          mutex_enter(&zone_status_lock);
4720 4755          if (zone_status_get(zone) != ZONE_IS_READY) {
4721 4756                  mutex_exit(&zone_status_lock);
4722 4757                  mutex_exit(&zonehash_lock);
4723 4758                  return (set_errno(EINVAL));
4724 4759          }
4725 4760          zone_status_set(zone, ZONE_IS_BOOTING);
4726 4761          mutex_exit(&zone_status_lock);
4727 4762  
4728 4763          zone_hold(zone);        /* so we can use the zone_t later */
4729 4764          mutex_exit(&zonehash_lock);
4730 4765  
4731 4766          if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4732 4767                  zone_rele(zone);
4733 4768                  return (set_errno(EINTR));
4734 4769          }
4735 4770  
4736 4771          /*
4737 4772           * Boot (starting init) might have failed, in which case the zone
4738 4773           * will go to the SHUTTING_DOWN state; an appropriate errno will
4739 4774           * be placed in zone->zone_boot_err, and so we return that.
4740 4775           */
                   /* Read the error before dropping the hold taken above. */
4741 4776          err = zone->zone_boot_err;
4742 4777          zone_rele(zone);
4743 4778          return (err ? set_errno(err) : 0);
4744 4779  }
4745 4780  
4746 4781  /*
4747 4782   * Kills all user processes in the zone, waiting for them all to exit
4748 4783   * before returning.
            *
            * Must be called without zonehash_lock held (asserted below).
            *
            * Returns 0 once the wait for ZONE_IS_EMPTY completes, or EINTR if the
            * wait reported 0 (we were signaled).  The value is a plain errno; the
            * caller decides whether to hand it to set_errno().
4749 4784   */
4750 4785  static int
4751 4786  zone_empty(zone_t *zone)
4752 4787  {
4753 4788          int waitstatus;
4754 4789  
4755 4790          /*
4756 4791           * We need to drop zonehash_lock before killing all
4757 4792           * processes, otherwise we'll deadlock with zone_find_*
4758 4793           * which can be called from the exit path.
4759 4794           */
4760 4795          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
                   /*
                    * Each pass waits until ddi_get_lbolt() + hz for the zone to
                    * become empty.  A result of -1 (presumably a timeout -- confirm
                    * against zone_status_timedwait_sig()) triggers another
                    * killall() and another wait.
                    */
4761 4796          while ((waitstatus = zone_status_timedwait_sig(zone,
4762 4797              ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4763 4798                  killall(zone->zone_id);
4764 4799          }
4765 4800          /*
4766 4801           * return EINTR if we were signaled
4767 4802           */
4768 4803          if (waitstatus == 0)
4769 4804                  return (EINTR);
4770 4805          return (0);
4771 4806  }
4772 4807  
4773 4808  /*
4774 4809   * This function implements the policy for zone visibility.
4775 4810   *
4776 4811   * In standard Solaris, a non-global zone can only see itself.
4777 4812   *
4778 4813   * In Trusted Extensions, a labeled zone can lookup any zone whose label
4779 4814   * it dominates. For this test, the label of the global zone is treated as
4780 4815   * admin_high so it is special-cased instead of being checked for dominance.
4781 4816   *
4782 4817   * Returns true if zone attributes are viewable, false otherwise.
            *
            * The decision is made for the calling process (curproc) against the
            * target 'zone'.
4783 4818   */
4784 4819  static boolean_t
4785 4820  zone_list_access(zone_t *zone)
4786 4821  {
4787 4822  
                   /* A caller in the global zone, or in 'zone' itself, always sees it. */
4788 4823          if (curproc->p_zone == global_zone ||
4789 4824              curproc->p_zone == zone) {
4790 4825                  return (B_TRUE);
4791 4826          } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4792 4827                  bslabel_t *curproc_label;
4793 4828                  bslabel_t *zone_label;
4794 4829  
4795 4830                  curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4796 4831                  zone_label = label2bslabel(zone->zone_slabel);
4797 4832  
                           /*
                            * The global zone is never visible through this branch;
                            * any other zone is visible when the caller's zone label
                            * dominates the target's label.
                            */
4798 4833                  if (zone->zone_id != GLOBAL_ZONEID &&
4799 4834                      bldominates(curproc_label, zone_label)) {
4800 4835                          return (B_TRUE);
4801 4836                  } else {
4802 4837                          return (B_FALSE);
4803 4838                  }
4804 4839          } else {
                           /* Unlabeled system, or a ZF_IS_SCRATCH target: not visible. */
4805 4840                  return (B_FALSE);
4806 4841          }
4807 4842  }
4808 4843  
4809 4844  /*
4810 4845   * Systemcall to start the zone's halt sequence.  By the time this
4811 4846   * function successfully returns, all user processes and kernel threads
4812 4847   * executing in it will have exited, ZSD shutdown callbacks executed,
4813 4848   * and the zone status set to ZONE_IS_DOWN.
4814 4849   *
4815 4850   * It is possible that the call will interrupt itself if the caller is the
4816 4851   * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
            *
            * Errors are reported through set_errno():
            *   EPERM   secpolicy_zone_config() refused the caller.
            *   EINVAL  zoneid is outside [MIN_USERZONEID, MAX_ZONEID], no such zone
            *           was found, or the zone has not yet reached ZONE_IS_READY.
            *   EINTR   block_mounts(), pool_lock_intr() or one of the status waits
            *           was interrupted.
            * Returns 0 on success, including when the zone was already at or past
            * ZONE_IS_DOWN.
4817 4852   */
4818 4853  static int
4819 4854  zone_shutdown(zoneid_t zoneid)
4820 4855  {
4821 4856          int error;
4822 4857          zone_t *zone;
4823 4858          zone_status_t status;
4824 4859  
4825 4860          if (secpolicy_zone_config(CRED()) != 0)
4826 4861                  return (set_errno(EPERM));
4827 4862          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4828 4863                  return (set_errno(EINVAL));
4829 4864  
4830 4865          mutex_enter(&zonehash_lock);
4831 4866          /*
4832 4867           * Look for zone under hash lock to prevent races with other
4833 4868           * calls to zone_shutdown and zone_destroy.
4834 4869           */
4835 4870          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4836 4871                  mutex_exit(&zonehash_lock);
4837 4872                  return (set_errno(EINVAL));
4838 4873          }
4839 4874  
4840 4875          /*
4841 4876           * We have to drop zonehash_lock before calling block_mounts.
4842 4877           * Hold the zone so we can continue to use the zone_t.
4843 4878           */
4844 4879          zone_hold(zone);
4845 4880          mutex_exit(&zonehash_lock);
4846 4881  
4847 4882          /*
4848 4883           * Block mounts so that VFS_MOUNT() can get an accurate view of
4849 4884           * the zone's status with regards to ZONE_IS_SHUTTING_DOWN.
4850 4885           *
4851 4886           * e.g. NFS can fail the mount if it determines that the zone
4852 4887           * has already begun the shutdown sequence.
4853 4888           *
4854 4889           */
4855 4890          if (block_mounts(zone) == 0) {
4856 4891                  zone_rele(zone);
4857 4892                  return (set_errno(EINTR));
4858 4893          }
4859 4894  
                   /*
                    * Lock order: zonehash_lock, then zone_status_lock.  Every exit
                    * from this locked region releases both, resumes mounts and, when
                    * returning, drops the hold taken above.
                    */
4860 4895          mutex_enter(&zonehash_lock);
4861 4896          mutex_enter(&zone_status_lock);
4862 4897          status = zone_status_get(zone);
4863 4898          /*
4864 4899           * Fail if the zone isn't fully initialized yet.
4865 4900           */
4866 4901          if (status < ZONE_IS_READY) {
4867 4902                  mutex_exit(&zone_status_lock);
4868 4903                  mutex_exit(&zonehash_lock);
4869 4904                  resume_mounts(zone);
4870 4905                  zone_rele(zone);
4871 4906                  return (set_errno(EINVAL));
4872 4907          }
4873 4908          /*
4874 4909           * If conditions required for zone_shutdown() to return have been met,
4875 4910           * return success.
4876 4911           */
4877 4912          if (status >= ZONE_IS_DOWN) {
4878 4913                  mutex_exit(&zone_status_lock);
4879 4914                  mutex_exit(&zonehash_lock);
4880 4915                  resume_mounts(zone);
4881 4916                  zone_rele(zone);
4882 4917                  return (0);
4883 4918          }
4884 4919          /*
4885 4920           * If zone_shutdown() hasn't been called before, go through the motions.
4886 4921           * If it has, there's nothing to do but wait for the kernel threads to
4887 4922           * drain.
4888 4923           */
4889 4924          if (status < ZONE_IS_EMPTY) {
4890 4925                  uint_t ntasks;
4891 4926  
                           /* zone_ntasks is sampled under zone_lock. */
4892 4927                  mutex_enter(&zone->zone_lock);
4893 4928                  if ((ntasks = zone->zone_ntasks) != 1) {
4894 4929                          /*
4895 4930                           * There's still stuff running.
4896 4931                           */
4897 4932                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4898 4933                  }
4899 4934                  mutex_exit(&zone->zone_lock);
4900 4935                  if (ntasks == 1) {
4901 4936                          /*
4902 4937                           * The only way to create another task is through
4903 4938                           * zone_enter(), which will block until we drop
4904 4939                           * zonehash_lock.  The zone is empty.
4905 4940                           */
4906 4941                          if (zone->zone_kthreads == NULL) {
4907 4942                                  /*
4908 4943                                   * Skip ahead to ZONE_IS_DOWN
4909 4944                                   */
4910 4945                                  zone_status_set(zone, ZONE_IS_DOWN);
4911 4946                          } else {
4912 4947                                  zone_status_set(zone, ZONE_IS_EMPTY);
4913 4948                          }
4914 4949                  }
4915 4950          }
4916 4951          mutex_exit(&zone_status_lock);
4917 4952          mutex_exit(&zonehash_lock);
4918 4953          resume_mounts(zone);
4919 4954  
                   /*
                    * The assignment inside the condition is intentional: a non-zero
                    * result from zone_empty() is the errno to report.
                    */
4920 4955          if (error = zone_empty(zone)) {
4921 4956                  zone_rele(zone);
4922 4957                  return (set_errno(error));
4923 4958          }
4924 4959          /*
4925 4960           * After the zone status goes to ZONE_IS_DOWN this zone will no
4926 4961           * longer be notified of changes to the pools configuration, so
4927 4962           * in order to not end up with a stale pool pointer, we point
4928 4963           * ourselves at the default pool and remove all resource
4929 4964           * visibility.  This is especially important as the zone_t may
4930 4965           * languish on the deathrow for a very long time waiting for
4931 4966           * cred's to drain out.
4932 4967           *
4933 4968           * This rebinding of the zone can happen multiple times
4934 4969           * (presumably due to interrupted or parallel systemcalls)
4935 4970           * without any adverse effects.
4936 4971           */
4937 4972          if (pool_lock_intr() != 0) {
4938 4973                  zone_rele(zone);
4939 4974                  return (set_errno(EINTR));
4940 4975          }
4941 4976          if (pool_state == POOL_ENABLED) {
4942 4977                  mutex_enter(&cpu_lock);
4943 4978                  zone_pool_set(zone, pool_default);
4944 4979                  /*
4945 4980                   * The zone no longer needs to be able to see any cpus.
4946 4981                   */
4947 4982                  zone_pset_set(zone, ZONE_PS_INVAL);
4948 4983                  mutex_exit(&cpu_lock);
4949 4984          }
4950 4985          pool_unlock();
4951 4986  
4952 4987          /*
4953 4988           * ZSD shutdown callbacks can be executed multiple times, hence
4954 4989           * it is safe to not be holding any locks across this call.
4955 4990           */
4956 4991          zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4957 4992  
                   /* With no kernel threads left, move straight to ZONE_IS_DOWN. */
4958 4993          mutex_enter(&zone_status_lock);
4959 4994          if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4960 4995                  zone_status_set(zone, ZONE_IS_DOWN);
4961 4996          mutex_exit(&zone_status_lock);
4962 4997  
4963 4998          /*
4964 4999           * Wait for kernel threads to drain.
4965 5000           */
4966 5001          if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4967 5002                  zone_rele(zone);
4968 5003                  return (set_errno(EINTR));
4969 5004          }
4970 5005  
4971 5006          /*
4972 5007           * Zone can become down/destroyable even if the above wait
4973 5008           * returns EINTR, so any code added here may never execute.
4974 5009           * (i.e. don't add code here)
4975 5010           */
4976 5011  
4977 5012          zone_rele(zone);
4978 5013          return (0);
4979 5014  }
4980 5015  
4981 5016  /*
4982 5017   * Log the specified zone's reference counts.  The caller should not be
4983 5018   * holding the zone's zone_lock.
4984 5019   */
4985 5020  static void
4986 5021  zone_log_refcounts(zone_t *zone)
4987 5022  {
4988 5023          char *buffer;
4989 5024          char *buffer_position;
4990 5025          uint32_t buffer_size;
4991 5026          uint32_t index;
4992 5027          uint_t ref;
4993 5028          uint_t cred_ref;
4994 5029  
4995 5030          /*
4996 5031           * Construct a string representing the subsystem-specific reference
4997 5032           * counts.  The counts are printed in ascending order by index into the
4998 5033           * zone_t::zone_subsys_ref array.  The list will be surrounded by
4999 5034           * square brackets [] and will only contain nonzero reference counts.
5000 5035           *
5001 5036           * The buffer will hold two square bracket characters plus ten digits,
5002 5037           * one colon, one space, one comma, and some characters for a
5003 5038           * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5004 5039           * bit integers have at most ten decimal digits.)  The last
5005 5040           * reference count's comma is replaced by the closing square
5006 5041           * bracket and a NULL character to terminate the string.
5007 5042           *
5008 5043           * NOTE: We have to grab the zone's zone_lock to create a consistent
5009 5044           * snapshot of the zone's reference counters.
5010 5045           *
5011 5046           * First, figure out how much space the string buffer will need.
5012 5047           * The buffer's size is stored in buffer_size.
5013 5048           */
5014 5049          buffer_size = 2;                        /* for the square brackets */
5015 5050          mutex_enter(&zone->zone_lock);
5016 5051          zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5017 5052          ref = zone->zone_ref;
5018 5053          cred_ref = zone->zone_cred_ref;
5019 5054          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5020 5055                  if (zone->zone_subsys_ref[index] != 0)
5021 5056                          buffer_size += strlen(zone_ref_subsys_names[index]) +
5022 5057                              13;
5023 5058          if (buffer_size == 2) {
5024 5059                  /*
5025 5060                   * No subsystems had nonzero reference counts.  Don't bother
5026 5061                   * with allocating a buffer; just log the general-purpose and
5027 5062                   * credential reference counts.
5028 5063                   */
5029 5064                  mutex_exit(&zone->zone_lock);
5030 5065                  (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5031 5066                      "Zone '%s' (ID: %d) is shutting down, but %u zone "
5032 5067                      "references and %u credential references are still extant",
5033 5068                      zone->zone_name, zone->zone_id, ref, cred_ref);
5034 5069                  return;
5035 5070          }
5036 5071  
5037 5072          /*
5038 5073           * buffer_size contains the exact number of characters that the
5039 5074           * buffer will need.  Allocate the buffer and fill it with nonzero
5040 5075           * subsystem-specific reference counts.  Surround the results with
5041 5076           * square brackets afterwards.
5042 5077           */
5043 5078          buffer = kmem_alloc(buffer_size, KM_SLEEP);
5044 5079          buffer_position = &buffer[1];
5045 5080          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5046 5081                  /*
5047 5082                   * NOTE: The DDI's version of sprintf() returns a pointer to
5048 5083                   * the modified buffer rather than the number of bytes written
5049 5084                   * (as in snprintf(3C)).  This is unfortunate and annoying.
5050 5085                   * Therefore, we'll use snprintf() with INT_MAX to get the
5051 5086                   * number of bytes written.  Using INT_MAX is safe because
5052 5087                   * the buffer is perfectly sized for the data: we'll never
5053 5088                   * overrun the buffer.
5054 5089                   */
5055 5090                  if (zone->zone_subsys_ref[index] != 0)
5056 5091                          buffer_position += snprintf(buffer_position, INT_MAX,
5057 5092                              "%s: %u,", zone_ref_subsys_names[index],
5058 5093                              zone->zone_subsys_ref[index]);
5059 5094          }
5060 5095          mutex_exit(&zone->zone_lock);
5061 5096          buffer[0] = '[';
5062 5097          ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5063 5098          ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5064 5099          buffer_position[-1] = ']';
5065 5100  
5066 5101          /*
5067 5102           * Log the reference counts and free the message buffer.
5068 5103           */
5069 5104          (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5070 5105              "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5071 5106              "%u credential references are still extant %s", zone->zone_name,
5072 5107              zone->zone_id, ref, cred_ref, buffer);
5073 5108          kmem_free(buffer, buffer_size);
5074 5109  }
5075 5110  
5076 5111  /*
5077 5112   * Systemcall entry point to finalize the zone halt process.  The caller
5078 5113   * must have already successfully called zone_shutdown().
5079 5114   *
5080 5115   * Upon successful completion, the zone will have been fully destroyed:
5081 5116   * zsched will have exited, destructor callbacks executed, and the zone
5082 5117   * removed from the list of active zones.
            *
            * Returns 0 on success (including the case where another thread
            * finished destroying the zone first).  Otherwise returns the result
            * of set_errno() with:
            *   EPERM  - secpolicy_zone_config() rejected the caller's credentials.
            *   EINVAL - zoneid is outside [MIN_USERZONEID, MAX_ZONEID] or no
            *            such zone was found.
            *   EBUSY  - zone_mount_count() for the zone's root path is nonzero,
            *            or the zone's status is still below ZONE_IS_DOWN.
            *   EINTR  - the wait on zone_destroy_cv was interrupted by a signal.
5083 5118   */
5084 5119  static int
5085 5120  zone_destroy(zoneid_t zoneid)
5086 5121  {
5087 5122          uint64_t uniqid;
5088 5123          zone_t *zone;
5089 5124          zone_status_t status;
5090 5125          clock_t wait_time;
5091 5126          boolean_t log_refcounts;
5092 5127  
5093 5128          if (secpolicy_zone_config(CRED()) != 0)
5094 5129                  return (set_errno(EPERM));
5095 5130          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5096 5131                  return (set_errno(EINVAL));
5097 5132  
5098 5133          mutex_enter(&zonehash_lock);
5099 5134          /*
5100 5135           * Look for zone under hash lock to prevent races with other
5101 5136           * calls to zone_destroy.
5102 5137           */
5103 5138          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5104 5139                  mutex_exit(&zonehash_lock);
5105 5140                  return (set_errno(EINVAL));
5106 5141          }
5107 5142  
5108 5143          if (zone_mount_count(zone->zone_rootpath) != 0) {
5109 5144                  mutex_exit(&zonehash_lock);
5110 5145                  return (set_errno(EBUSY));
5111 5146          }
5112 5147          mutex_enter(&zone_status_lock);
5113 5148          status = zone_status_get(zone);
5114 5149          if (status < ZONE_IS_DOWN) {
5115 5150                  mutex_exit(&zone_status_lock);
5116 5151                  mutex_exit(&zonehash_lock);
5117 5152                  return (set_errno(EBUSY));
5118 5153          } else if (status == ZONE_IS_DOWN) {
5119 5154                  zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5120 5155          }
5121 5156          mutex_exit(&zone_status_lock);
                   /*
                    * Hold the zone before dropping zonehash_lock; the hold is
                    * released below once the ZSD destroy callbacks have run.
                    */
5122 5157          zone_hold(zone);
5123 5158          mutex_exit(&zonehash_lock);
5124 5159  
5125 5160          /*
5126 5161           * wait for zsched to exit
5127 5162           */
5128 5163          zone_status_wait(zone, ZONE_IS_DEAD);
5129 5164          zone_zsd_callbacks(zone, ZSD_DESTROY);
5130 5165          zone->zone_netstack = NULL;
                   /*
                    * Remember the zone's unique id: the loop below looks the zone up
                    * again by zoneid and uses uniqid to tell whether what it finds
                    * is still this zone.
                    */
5131 5166          uniqid = zone->zone_uniqid;
5132 5167          zone_rele(zone);
5133 5168          zone = NULL;    /* potentially free'd */
5134 5169  
5135 5170          log_refcounts = B_FALSE;
5136 5171          wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5137 5172          mutex_enter(&zonehash_lock);
5138 5173          for (; /* ever */; ) {
5139 5174                  boolean_t unref;
5140 5175                  boolean_t refs_have_been_logged;
5141 5176  
5142 5177                  if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5143 5178                      zone->zone_uniqid != uniqid) {
5144 5179                          /*
5145 5180                           * The zone has gone away.  Necessary conditions
5146 5181                           * are met, so we return success.
5147 5182                           */
5148 5183                          mutex_exit(&zonehash_lock);
5149 5184                          return (0);
5150 5185                  }
5151 5186                  mutex_enter(&zone->zone_lock);
5152 5187                  unref = ZONE_IS_UNREF(zone);
                           /*
                            * Nonzero once zone_log_refcounts() has run for this zone
                            * (it sets ZF_REFCOUNTS_LOGGED); the value is only ever
                            * tested for zero/nonzero below.
                            */
5153 5188                  refs_have_been_logged = (zone->zone_flags &
5154 5189                      ZF_REFCOUNTS_LOGGED);
5155 5190                  mutex_exit(&zone->zone_lock);
5156 5191                  if (unref) {
5157 5192                          /*
5158 5193                           * There is only one reference to the zone -- that
5159 5194                           * added when the zone was added to the hashtables --
5160 5195                           * and things will remain this way until we drop
5161 5196                           * zonehash_lock... we can go ahead and cleanup the
5162 5197                           * zone.
5163 5198                           */
5164 5199                          break;
5165 5200                  }
5166 5201  
5167 5202                  /*
5168 5203                   * Wait for zone_rele_common() or zone_cred_rele() to signal
5169 5204                   * zone_destroy_cv.  zone_destroy_cv is signaled only when
5170 5205                   * some zone's general-purpose reference count reaches one.
5171 5206                   * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5172 5207                   * on zone_destroy_cv, then log the zone's reference counts and
5173 5208                   * continue to wait for zone_rele() and zone_cred_rele().
5174 5209                   */
5175 5210                  if (!refs_have_been_logged) {
5176 5211                          if (!log_refcounts) {
5177 5212                                  /*
5178 5213                                   * This thread hasn't timed out waiting on
5179 5214                                   * zone_destroy_cv yet.  Wait wait_time clock
5180 5215                                   * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5181 5216                                   * seconds) for the zone's references to clear.
5182 5217                                   */
5183 5218                                  ASSERT(wait_time > 0);
5184 5219                                  wait_time = cv_reltimedwait_sig(
5185 5220                                      &zone_destroy_cv, &zonehash_lock, wait_time,
5186 5221                                      TR_SEC);
5187 5222                                  if (wait_time > 0) {
5188 5223                                          /*
5189 5224                                           * A thread in zone_rele() or
5190 5225                                           * zone_cred_rele() signaled
5191 5226                                           * zone_destroy_cv before this thread's
5192 5227                                           * wait timed out.  The zone might have
5193 5228                                           * only one reference left; find out!
5194 5229                                           */
5195 5230                                          continue;
5196 5231                                  } else if (wait_time == 0) {
5197 5232                                          /* The thread's process was signaled. */
5198 5233                                          mutex_exit(&zonehash_lock);
5199 5234                                          return (set_errno(EINTR));
5200 5235                                  }
5201 5236  
5202 5237                                  /*
5203 5238                                   * The thread timed out while waiting on
5204 5239                                   * zone_destroy_cv.  Even though the thread
5205 5240                                   * timed out, it has to check whether another
5206 5241                                   * thread woke up from zone_destroy_cv and
5207 5242                                   * destroyed the zone.
5208 5243                                   *
5209 5244                                   * If the zone still exists and has more than
5210 5245                                   * one unreleased general-purpose reference,
5211 5246                                   * then log the zone's reference counts.
5212 5247                                   */
5213 5248                                  log_refcounts = B_TRUE;
5214 5249                                  continue;
5215 5250                          }
5216 5251  
5217 5252                          /*
5218 5253                           * The thread already timed out on zone_destroy_cv while
5219 5254                           * waiting for subsystems to release the zone's last
5220 5255                           * general-purpose references.  Log the zone's reference
5221 5256                           * counts and wait indefinitely on zone_destroy_cv.
5222 5257                           */
5223 5258                          zone_log_refcounts(zone);
5224 5259                  }
5225 5260                  if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5226 5261                          /* The thread's process was signaled. */
5227 5262                          mutex_exit(&zonehash_lock);
5228 5263                          return (set_errno(EINTR));
5229 5264                  }
5230 5265          }
5231 5266  
                   /*
                    * The loop is only left via the "unref" break above, so
                    * zonehash_lock is still held here; it is dropped after the zone
                    * has been removed from the active list and the hash tables.
                    */
5232 5267          /*
5233 5268           * Remove CPU cap for this zone now since we're not going to
5234 5269           * fail below this point.
5235 5270           */
5236 5271          cpucaps_zone_remove(zone);
5237 5272  
5238 5273          /* Get rid of the zone's kstats */
5239 5274          zone_kstat_delete(zone);
5240 5275  
5241 5276          /* remove the pfexecd doors */
5242 5277          if (zone->zone_pfexecd != NULL) {
5243 5278                  klpd_freelist(&zone->zone_pfexecd);
5244 5279                  zone->zone_pfexecd = NULL;
5245 5280          }
5246 5281  
5247 5282          /* free brand specific data */
5248 5283          if (ZONE_IS_BRANDED(zone))
5249 5284                  ZBROP(zone)->b_free_brand_data(zone);
5250 5285  
5251 5286          /* Say goodbye to brand framework. */
5252 5287          brand_unregister_zone(zone->zone_brand);
5253 5288  
5254 5289          /*
5255 5290           * It is now safe to let the zone be recreated; remove it from the
5256 5291           * lists.  The memory will not be freed until the last cred
5257 5292           * reference goes away.
5258 5293           */
5259 5294          ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5260 5295          zonecount--;
5261 5296          /* remove from active list and hash tables */
5262 5297          list_remove(&zone_active, zone);
5263 5298          (void) mod_hash_destroy(zonehashbyname,
5264 5299              (mod_hash_key_t)zone->zone_name);
5265 5300          (void) mod_hash_destroy(zonehashbyid,
5266 5301              (mod_hash_key_t)(uintptr_t)zone->zone_id);
5267 5302          if (zone->zone_flags & ZF_HASHED_LABEL)
5268 5303                  (void) mod_hash_destroy(zonehashbylabel,
5269 5304                      (mod_hash_key_t)zone->zone_slabel);
5270 5305          mutex_exit(&zonehash_lock);
5271 5306  
5272 5307          /*
5273 5308           * Release the root vnode; we're not using it anymore.  Nor should any
5274 5309           * other thread that might access it exist.
5275 5310           */
5276 5311          if (zone->zone_rootvp != NULL) {
5277 5312                  VN_RELE(zone->zone_rootvp);
5278 5313                  zone->zone_rootvp = NULL;
5279 5314          }
5280 5315  
5281 5316          /* add to deathrow list */
5282 5317          mutex_enter(&zone_deathrow_lock);
5283 5318          list_insert_tail(&zone_deathrow, zone);
5284 5319          mutex_exit(&zone_deathrow_lock);
5285 5320  
5286 5321          /*
5287 5322           * Drop last reference (which was added by zsched()), this will
5288 5323           * free the zone unless there are outstanding cred references.
5289 5324           */
5290 5325          zone_rele(zone);
5291 5326          return (0);
5292 5327  }
5293 5328  
5294 5329  /*
5295 5330   * Systemcall entry point for zone_getattr(2).
5296 5331   */
5297 5332  static ssize_t
5298 5333  zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5299 5334  {
5300 5335          size_t size;
5301 5336          int error = 0, err;
5302 5337          zone_t *zone;
5303 5338          char *zonepath;
5304 5339          char *outstr;
5305 5340          zone_status_t zone_status;
5306 5341          pid_t initpid;
5307 5342          boolean_t global = (curzone == global_zone);
5308 5343          boolean_t inzone = (curzone->zone_id == zoneid);
5309 5344          ushort_t flags;
5310 5345          zone_net_data_t *zbuf;
5311 5346  
5312 5347          mutex_enter(&zonehash_lock);
5313 5348          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5314 5349                  mutex_exit(&zonehash_lock);
5315 5350                  return (set_errno(EINVAL));
5316 5351          }
5317 5352          zone_status = zone_status_get(zone);
5318 5353          if (zone_status < ZONE_IS_INITIALIZED) {
5319 5354                  mutex_exit(&zonehash_lock);
5320 5355                  return (set_errno(EINVAL));
5321 5356          }
5322 5357          zone_hold(zone);
5323 5358          mutex_exit(&zonehash_lock);
5324 5359  
5325 5360          /*
5326 5361           * If not in the global zone, don't show information about other zones,
5327 5362           * unless the system is labeled and the local zone's label dominates
5328 5363           * the other zone.
5329 5364           */
5330 5365          if (!zone_list_access(zone)) {
5331 5366                  zone_rele(zone);
5332 5367                  return (set_errno(EINVAL));
5333 5368          }
5334 5369  
5335 5370          switch (attr) {
5336 5371          case ZONE_ATTR_ROOT:
5337 5372                  if (global) {
5338 5373                          /*
5339 5374                           * Copy the path to trim the trailing "/" (except for
5340 5375                           * the global zone).
5341 5376                           */
5342 5377                          if (zone != global_zone)
5343 5378                                  size = zone->zone_rootpathlen - 1;
5344 5379                          else
5345 5380                                  size = zone->zone_rootpathlen;
5346 5381                          zonepath = kmem_alloc(size, KM_SLEEP);
5347 5382                          bcopy(zone->zone_rootpath, zonepath, size);
5348 5383                          zonepath[size - 1] = '\0';
5349 5384                  } else {
5350 5385                          if (inzone || !is_system_labeled()) {
5351 5386                                  /*
5352 5387                                   * Caller is not in the global zone.
5353 5388                                   * if the query is on the current zone
5354 5389                                   * or the system is not labeled,
5355 5390                                   * just return faked-up path for current zone.
5356 5391                                   */
5357 5392                                  zonepath = "/";
5358 5393                                  size = 2;
5359 5394                          } else {
5360 5395                                  /*
5361 5396                                   * Return related path for current zone.
5362 5397                                   */
5363 5398                                  int prefix_len = strlen(zone_prefix);
5364 5399                                  int zname_len = strlen(zone->zone_name);
5365 5400  
5366 5401                                  size = prefix_len + zname_len + 1;
5367 5402                                  zonepath = kmem_alloc(size, KM_SLEEP);
5368 5403                                  bcopy(zone_prefix, zonepath, prefix_len);
5369 5404                                  bcopy(zone->zone_name, zonepath +
5370 5405                                      prefix_len, zname_len);
5371 5406                                  zonepath[size - 1] = '\0';
5372 5407                          }
5373 5408                  }
5374 5409                  if (bufsize > size)
5375 5410                          bufsize = size;
5376 5411                  if (buf != NULL) {
5377 5412                          err = copyoutstr(zonepath, buf, bufsize, NULL);
5378 5413                          if (err != 0 && err != ENAMETOOLONG)
5379 5414                                  error = EFAULT;
5380 5415                  }
5381 5416                  if (global || (is_system_labeled() && !inzone))
5382 5417                          kmem_free(zonepath, size);
5383 5418                  break;
5384 5419  
5385 5420          case ZONE_ATTR_NAME:
5386 5421                  size = strlen(zone->zone_name) + 1;
5387 5422                  if (bufsize > size)
5388 5423                          bufsize = size;
5389 5424                  if (buf != NULL) {
5390 5425                          err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5391 5426                          if (err != 0 && err != ENAMETOOLONG)
5392 5427                                  error = EFAULT;
5393 5428                  }
5394 5429                  break;
5395 5430  
5396 5431          case ZONE_ATTR_STATUS:
5397 5432                  /*
5398 5433                   * Since we're not holding zonehash_lock, the zone status
5399 5434                   * may be anything; leave it up to userland to sort it out.
5400 5435                   */
5401 5436                  size = sizeof (zone_status);
5402 5437                  if (bufsize > size)
5403 5438                          bufsize = size;
5404 5439                  zone_status = zone_status_get(zone);
5405 5440                  if (buf != NULL &&
5406 5441                      copyout(&zone_status, buf, bufsize) != 0)
5407 5442                          error = EFAULT;
5408 5443                  break;
5409 5444          case ZONE_ATTR_FLAGS:
5410 5445                  size = sizeof (zone->zone_flags);
5411 5446                  if (bufsize > size)
5412 5447                          bufsize = size;
5413 5448                  flags = zone->zone_flags;
5414 5449                  if (buf != NULL &&
5415 5450                      copyout(&flags, buf, bufsize) != 0)
5416 5451                          error = EFAULT;
5417 5452                  break;
5418 5453          case ZONE_ATTR_PRIVSET:
5419 5454                  size = sizeof (priv_set_t);
5420 5455                  if (bufsize > size)
5421 5456                          bufsize = size;
5422 5457                  if (buf != NULL &&
5423 5458                      copyout(zone->zone_privset, buf, bufsize) != 0)
5424 5459                          error = EFAULT;
5425 5460                  break;
5426 5461          case ZONE_ATTR_UNIQID:
5427 5462                  size = sizeof (zone->zone_uniqid);
5428 5463                  if (bufsize > size)
5429 5464                          bufsize = size;
5430 5465                  if (buf != NULL &&
5431 5466                      copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5432 5467                          error = EFAULT;
5433 5468                  break;
5434 5469          case ZONE_ATTR_POOLID:
5435 5470                  {
5436 5471                          pool_t *pool;
5437 5472                          poolid_t poolid;
5438 5473  
5439 5474                          if (pool_lock_intr() != 0) {
5440 5475                                  error = EINTR;
5441 5476                                  break;
5442 5477                          }
5443 5478                          pool = zone_pool_get(zone);
5444 5479                          poolid = pool->pool_id;
5445 5480                          pool_unlock();
5446 5481                          size = sizeof (poolid);
5447 5482                          if (bufsize > size)
5448 5483                                  bufsize = size;
5449 5484                          if (buf != NULL && copyout(&poolid, buf, size) != 0)
5450 5485                                  error = EFAULT;
5451 5486                  }
5452 5487                  break;
5453 5488          case ZONE_ATTR_SLBL:
5454 5489                  size = sizeof (bslabel_t);
5455 5490                  if (bufsize > size)
5456 5491                          bufsize = size;
5457 5492                  if (zone->zone_slabel == NULL)
5458 5493                          error = EINVAL;
5459 5494                  else if (buf != NULL &&
5460 5495                      copyout(label2bslabel(zone->zone_slabel), buf,
5461 5496                      bufsize) != 0)
5462 5497                          error = EFAULT;
5463 5498                  break;
5464 5499          case ZONE_ATTR_INITPID:
5465 5500                  size = sizeof (initpid);
5466 5501                  if (bufsize > size)
5467 5502                          bufsize = size;
5468 5503                  initpid = zone->zone_proc_initpid;
5469 5504                  if (initpid == -1) {
5470 5505                          error = ESRCH;
5471 5506                          break;
5472 5507                  }
5473 5508                  if (buf != NULL &&
5474 5509                      copyout(&initpid, buf, bufsize) != 0)
5475 5510                          error = EFAULT;
5476 5511                  break;
5477 5512          case ZONE_ATTR_BRAND:
5478 5513                  size = strlen(zone->zone_brand->b_name) + 1;
5479 5514  
5480 5515                  if (bufsize > size)
5481 5516                          bufsize = size;
5482 5517                  if (buf != NULL) {
5483 5518                          err = copyoutstr(zone->zone_brand->b_name, buf,
5484 5519                              bufsize, NULL);
5485 5520                          if (err != 0 && err != ENAMETOOLONG)
5486 5521                                  error = EFAULT;
5487 5522                  }
5488 5523                  break;
5489 5524          case ZONE_ATTR_INITNAME:
5490 5525                  size = strlen(zone->zone_initname) + 1;
5491 5526                  if (bufsize > size)
5492 5527                          bufsize = size;
5493 5528                  if (buf != NULL) {
5494 5529                          err = copyoutstr(zone->zone_initname, buf, bufsize,
5495 5530                              NULL);
5496 5531                          if (err != 0 && err != ENAMETOOLONG)
5497 5532                                  error = EFAULT;
5498 5533                  }
5499 5534                  break;
5500 5535          case ZONE_ATTR_BOOTARGS:
5501 5536                  if (zone->zone_bootargs == NULL)
5502 5537                          outstr = "";
5503 5538                  else
5504 5539                          outstr = zone->zone_bootargs;
5505 5540                  size = strlen(outstr) + 1;
5506 5541                  if (bufsize > size)
5507 5542                          bufsize = size;
5508 5543                  if (buf != NULL) {
5509 5544                          err = copyoutstr(outstr, buf, bufsize, NULL);
5510 5545                          if (err != 0 && err != ENAMETOOLONG)
5511 5546                                  error = EFAULT;
5512 5547                  }
5513 5548                  break;
5514 5549          case ZONE_ATTR_PHYS_MCAP:
5515 5550                  size = sizeof (zone->zone_phys_mcap);
5516 5551                  if (bufsize > size)
5517 5552                          bufsize = size;
5518 5553                  if (buf != NULL &&
5519 5554                      copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5520 5555                          error = EFAULT;
5521 5556                  break;
5522 5557          case ZONE_ATTR_SCHED_CLASS:
5523 5558                  mutex_enter(&class_lock);
5524 5559  
5525 5560                  if (zone->zone_defaultcid >= loaded_classes)
5526 5561                          outstr = "";
5527 5562                  else
5528 5563                          outstr = sclass[zone->zone_defaultcid].cl_name;
5529 5564                  size = strlen(outstr) + 1;
5530 5565                  if (bufsize > size)
5531 5566                          bufsize = size;
5532 5567                  if (buf != NULL) {
5533 5568                          err = copyoutstr(outstr, buf, bufsize, NULL);
5534 5569                          if (err != 0 && err != ENAMETOOLONG)
5535 5570                                  error = EFAULT;
5536 5571                  }
5537 5572  
5538 5573                  mutex_exit(&class_lock);
5539 5574                  break;
5540 5575          case ZONE_ATTR_HOSTID:
5541 5576                  if (zone->zone_hostid != HW_INVALID_HOSTID &&
5542 5577                      bufsize == sizeof (zone->zone_hostid)) {
5543 5578                          size = sizeof (zone->zone_hostid);
5544 5579                          if (buf != NULL && copyout(&zone->zone_hostid, buf,
5545 5580                              bufsize) != 0)
5546 5581                                  error = EFAULT;
5547 5582                  } else {
5548 5583                          error = EINVAL;
5549 5584                  }
5550 5585                  break;
5551 5586          case ZONE_ATTR_FS_ALLOWED:
5552 5587                  if (zone->zone_fs_allowed == NULL)
5553 5588                          outstr = "";
5554 5589                  else

↓ open down ↓

1101 lines elided

↑ open up ↑

5555 5590                          outstr = zone->zone_fs_allowed;
5556 5591                  size = strlen(outstr) + 1;
5557 5592                  if (bufsize > size)
5558 5593                          bufsize = size;
5559 5594                  if (buf != NULL) {
5560 5595                          err = copyoutstr(outstr, buf, bufsize, NULL);
5561 5596                          if (err != 0 && err != ENAMETOOLONG)
5562 5597                                  error = EFAULT;
5563 5598                  }
5564 5599                  break;
     5600 +        case ZONE_ATTR_SECFLAGS:
     5601 +                size = sizeof (zone->zone_secflags);
     5602 +                if (bufsize > size)
     5603 +                        bufsize = size;
     5604 +                if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
     5605 +                        error = EFAULT;
     5606 +                break;
5565 5607          case ZONE_ATTR_NETWORK:
5566 5608                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5567 5609                  if (copyin(buf, zbuf, bufsize) != 0) {
5568 5610                          error = EFAULT;
5569 5611                  } else {
5570 5612                          error = zone_get_network(zoneid, zbuf);
5571 5613                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5572 5614                                  error = EFAULT;
5573 5615                  }
5574 5616                  kmem_free(zbuf, bufsize);

5575 5617                  break;
5576 5618          default:
5577 5619                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5578 5620                          size = bufsize;
5579 5621                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5580 5622                  } else {
5581 5623                          error = EINVAL;
5582 5624                  }
5583 5625          }
5584 5626          zone_rele(zone);
5585 5627  
5586 5628          if (error)
5587 5629                  return (set_errno(error));
5588 5630          return ((ssize_t)size);
5589 5631  }
5590 5632  
5591 5633  /*
5592 5634   * Systemcall entry point for zone_setattr(2).
5593 5635   */
5594 5636  /*ARGSUSED*/
5595 5637  static int
5596 5638  zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5597 5639  {
5598 5640          zone_t *zone;
5599 5641          zone_status_t zone_status;
5600 5642          int err = -1;
5601 5643          zone_net_data_t *zbuf;
5602 5644  
5603 5645          if (secpolicy_zone_config(CRED()) != 0)
5604 5646                  return (set_errno(EPERM));
5605 5647  
5606 5648          /*
5607 5649           * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5608 5650           * global zone.
5609 5651           */
5610 5652          if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5611 5653                  return (set_errno(EINVAL));
5612 5654          }
5613 5655  
5614 5656          mutex_enter(&zonehash_lock);
5615 5657          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5616 5658                  mutex_exit(&zonehash_lock);
5617 5659                  return (set_errno(EINVAL));
5618 5660          }
5619 5661          zone_hold(zone);
5620 5662          mutex_exit(&zonehash_lock);
5621 5663  
5622 5664          /*
5623 5665           * At present most attributes can only be set on non-running,
5624 5666           * non-global zones.
5625 5667           */
5626 5668          zone_status = zone_status_get(zone);
5627 5669          if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5628 5670                  err = EINVAL;
5629 5671                  goto done;
5630 5672          }
5631 5673  
5632 5674          switch (attr) {
5633 5675          case ZONE_ATTR_INITNAME:
5634 5676                  err = zone_set_initname(zone, (const char *)buf);
5635 5677                  break;
5636 5678          case ZONE_ATTR_INITNORESTART:
5637 5679                  zone->zone_restart_init = B_FALSE;
5638 5680                  err = 0;

↓ open down ↓

64 lines elided

↑ open up ↑

5639 5681                  break;
5640 5682          case ZONE_ATTR_BOOTARGS:
5641 5683                  err = zone_set_bootargs(zone, (const char *)buf);
5642 5684                  break;
5643 5685          case ZONE_ATTR_BRAND:
5644 5686                  err = zone_set_brand(zone, (const char *)buf);
5645 5687                  break;
5646 5688          case ZONE_ATTR_FS_ALLOWED:
5647 5689                  err = zone_set_fs_allowed(zone, (const char *)buf);
5648 5690                  break;
     5691 +        case ZONE_ATTR_SECFLAGS:
     5692 +                err = zone_set_secflags(zone, (psecflags_t *)buf);
     5693 +                break;
5649 5694          case ZONE_ATTR_PHYS_MCAP:
5650 5695                  err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5651 5696                  break;
5652 5697          case ZONE_ATTR_SCHED_CLASS:
5653 5698                  err = zone_set_sched_class(zone, (const char *)buf);
5654 5699                  break;
5655 5700          case ZONE_ATTR_HOSTID:
5656 5701                  if (bufsize == sizeof (zone->zone_hostid)) {
5657 5702                          if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5658 5703                                  err = 0;

5659 5704                          else
5660 5705                                  err = EFAULT;
5661 5706                  } else {
5662 5707                          err = EINVAL;
5663 5708                  }
5664 5709                  break;
5665 5710          case ZONE_ATTR_NETWORK:
5666 5711                  if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5667 5712                          err = EINVAL;
5668 5713                          break;
5669 5714                  }
5670 5715                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5671 5716                  if (copyin(buf, zbuf, bufsize) != 0) {
5672 5717                          kmem_free(zbuf, bufsize);
5673 5718                          err = EFAULT;
5674 5719                          break;
5675 5720                  }
5676 5721                  err = zone_set_network(zoneid, zbuf);
5677 5722                  kmem_free(zbuf, bufsize);
5678 5723                  break;
5679 5724          default:
5680 5725                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5681 5726                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5682 5727                  else
5683 5728                          err = EINVAL;
5684 5729          }
5685 5730  
5686 5731  done:
5687 5732          zone_rele(zone);
5688 5733          ASSERT(err != -1);
5689 5734          return (err != 0 ? set_errno(err) : 0);
5690 5735  }
5691 5736  
5692 5737  /*
5693 5738   * Return zero if the process has at least one vnode mapped in to its
5694 5739   * address space which shouldn't be allowed to change zones.
5695 5740   *
5696 5741   * Also return zero if the process has any shared mappings which reserve
5697 5742   * swap.  This is because the counting for zone.max-swap does not allow swap
5698 5743   * reservation to be shared between zones.  zone swap reservation is counted
5699 5744   * on zone->zone_max_swap.
5700 5745   */
5701 5746  static int
5702 5747  as_can_change_zones(void)
5703 5748  {
5704 5749          proc_t *pp = curproc;
5705 5750          struct seg *seg;
5706 5751          struct as *as = pp->p_as;
5707 5752          vnode_t *vp;
5708 5753          int allow = 1;
5709 5754  
5710 5755          ASSERT(pp->p_as != &kas);
5711 5756          AS_LOCK_ENTER(as, RW_READER);
5712 5757          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5713 5758  
5714 5759                  /*
5715 5760                   * Cannot enter zone with shared anon memory which
5716 5761                   * reserves swap.  See comment above.
5717 5762                   */
5718 5763                  if (seg_can_change_zones(seg) == B_FALSE) {
5719 5764                          allow = 0;
5720 5765                          break;
5721 5766                  }
5722 5767                  /*
5723 5768                   * if we can't get a backing vnode for this segment then skip
5724 5769                   * it.
5725 5770                   */
5726 5771                  vp = NULL;
5727 5772                  if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5728 5773                          continue;
5729 5774                  if (!vn_can_change_zones(vp)) { /* bail on first match */
5730 5775                          allow = 0;
5731 5776                          break;
5732 5777                  }
5733 5778          }
5734 5779          AS_LOCK_EXIT(as);
5735 5780          return (allow);
5736 5781  }
5737 5782  
5738 5783  /*
5739 5784   * Count swap reserved by curproc's address space
5740 5785   */
5741 5786  static size_t
5742 5787  as_swresv(void)
5743 5788  {
5744 5789          proc_t *pp = curproc;
5745 5790          struct seg *seg;
5746 5791          struct as *as = pp->p_as;
5747 5792          size_t swap = 0;
5748 5793  
5749 5794          ASSERT(pp->p_as != &kas);
5750 5795          ASSERT(AS_WRITE_HELD(as));
5751 5796          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5752 5797                  swap += seg_swresv(seg);
5753 5798  
5754 5799          return (swap);
5755 5800  }
5756 5801  
5757 5802  /*
5758 5803   * Systemcall entry point for zone_enter().
5759 5804   *
5760 5805   * The current process is injected into said zone.  In the process
5761 5806   * it will change its project membership, privileges, rootdir/cwd,
5762 5807   * zone-wide rctls, and pool association to match those of the zone.
5763 5808   *
5764 5809   * The first zone_enter() called while the zone is in the ZONE_IS_READY
5765 5810   * state will transition it to ZONE_IS_RUNNING.  Processes may only
5766 5811   * enter a zone that is "ready" or "running".
            *
            * Returns 0 on success.  On failure returns set_errno() of:
            *   EPERM   the caller fails secpolicy_zone_config(), or the zone's
            *           privilege set is not a subset of the caller's permitted set
            *   EINVAL  zoneid is out of range, the caller is not in the global
            *           zone, the zone does not exist, the process contract
            *           restrictions below are not met, or the zone is not in a
            *           joinable state
            *   EINTR   holdlwps() or pool_lock_intr() failed
            *   EBADF   files_can_change_zones() refused
            *   EFAULT  as_can_change_zones() refused
            * or the error returned by pool_do_bind().
5767 5812   */
5768 5813  static int
5769 5814  zone_enter(zoneid_t zoneid)
5770 5815  {
5771 5816          zone_t *zone;
5772 5817          vnode_t *vp;
5773 5818          proc_t *pp = curproc;
5774 5819          contract_t *ct;
5775 5820          cont_process_t *ctp;
5776 5821          task_t *tk, *oldtk;
5777 5822          kproject_t *zone_proj0;
5778 5823          cred_t *cr, *newcr;
5779 5824          pool_t *oldpool, *newpool;
5780 5825          sess_t *sp;
5781 5826          uid_t uid;
5782 5827          zone_status_t status;
5783 5828          int err = 0;
5784 5829          rctl_entity_p_t e;
5785 5830          size_t swap;
5786 5831          kthread_id_t t;
5787 5832  
                   /* These two checks return directly: no lwps are held yet. */
5788 5833          if (secpolicy_zone_config(CRED()) != 0)
5789 5834                  return (set_errno(EPERM));
5790 5835          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5791 5836                  return (set_errno(EINVAL));
5792 5837  
5793 5838          /*
5794 5839           * Stop all lwps so we don't need to hold a lock to look at
5795 5840           * curproc->p_zone.  This needs to happen before we grab any
5796 5841           * locks to avoid deadlock (another lwp in the process could
5797 5842           * be waiting for the held lock).
5798 5843           */
5799 5844          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5800 5845                  return (set_errno(EINTR));
5801 5846  
5802 5847          /*
5803 5848           * Make sure we're not changing zones with files open or mapped in
5804 5849           * to our address space which shouldn't be changing zones.
5805 5850           */
5806 5851          if (!files_can_change_zones()) {
5807 5852                  err = EBADF;
5808 5853                  goto out;
5809 5854          }
5810 5855          if (!as_can_change_zones()) {
5811 5856                  err = EFAULT;
5812 5857                  goto out;
5813 5858          }
5814 5859  
                   /* Only a process currently in the global zone may enter a zone. */
5815 5860          mutex_enter(&zonehash_lock);
5816 5861          if (pp->p_zone != global_zone) {
5817 5862                  mutex_exit(&zonehash_lock);
5818 5863                  err = EINVAL;
5819 5864                  goto out;
5820 5865          }
5821 5866  
5822 5867          zone = zone_find_all_by_id(zoneid);
5823 5868          if (zone == NULL) {
5824 5869                  mutex_exit(&zonehash_lock);
5825 5870                  err = EINVAL;
5826 5871                  goto out;
5827 5872          }
5828 5873  
5829 5874          /*
5830 5875           * To prevent processes in a zone from holding contracts on
5831 5876           * extrazonal resources, and to avoid process contract
5832 5877           * memberships which span zones, contract holders and processes
5833 5878           * which aren't the sole members of their encapsulating process
5834 5879           * contracts are not allowed to zone_enter.
5835 5880           */
5836 5881          ctp = pp->p_ct_process;
5837 5882          ct = &ctp->conp_contract;
5838 5883          mutex_enter(&ct->ct_lock);
5839 5884          mutex_enter(&pp->p_lock);
5840 5885          if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5841 5886                  mutex_exit(&pp->p_lock);
5842 5887                  mutex_exit(&ct->ct_lock);
5843 5888                  mutex_exit(&zonehash_lock);
5844 5889                  err = EINVAL;
5845 5890                  goto out;
5846 5891          }
5847 5892  
5848 5893          /*
5849 5894           * Moreover, we don't allow processes whose encapsulating
5850 5895           * process contracts have inherited extrazonal contracts.
5851 5896           * While it would be easier to eliminate all process contracts
5852 5897           * with inherited contracts, we need to be able to give a
5853 5898           * restarted init (or other zone-penetrating process) its
5854 5899           * predecessor's contracts.
5855 5900           */
5856 5901          if (ctp->conp_ninherited != 0) {
5857 5902                  contract_t *next;
5858 5903                  for (next = list_head(&ctp->conp_inherited); next;
5859 5904                      next = list_next(&ctp->conp_inherited, next)) {
5860 5905                          if (contract_getzuniqid(next) != zone->zone_uniqid) {
5861 5906                                  mutex_exit(&pp->p_lock);
5862 5907                                  mutex_exit(&ct->ct_lock);
5863 5908                                  mutex_exit(&zonehash_lock);
5864 5909                                  err = EINVAL;
5865 5910                                  goto out;
5866 5911                          }
5867 5912                  }
5868 5913          }
5869 5914  
5870 5915          mutex_exit(&pp->p_lock);
5871 5916          mutex_exit(&ct->ct_lock);
5872 5917  
5873 5918          status = zone_status_get(zone);
5874 5919          if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5875 5920                  /*
5876 5921                   * Can't join
5877 5922                   */
5878 5923                  mutex_exit(&zonehash_lock);
5879 5924                  err = EINVAL;
5880 5925                  goto out;
5881 5926          }
5882 5927  
5883 5928          /*
5884 5929           * Make sure new priv set is within the permitted set for caller
5885 5930           */
5886 5931          if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5887 5932                  mutex_exit(&zonehash_lock);
5888 5933                  err = EPERM;
5889 5934                  goto out;
5890 5935          }
5891 5936          /*
5892 5937           * We want to momentarily drop zonehash_lock while we optimistically
5893 5938           * bind curproc to the pool it should be running in.  This is safe
5894 5939           * since the zone can't disappear (we have a hold on it).
5895 5940           */
5896 5941          zone_hold(zone);
5897 5942          mutex_exit(&zonehash_lock);
5898 5943  
5899 5944          /*
5900 5945           * Grab pool_lock to keep the pools configuration from changing
5901 5946           * and to stop ourselves from getting rebound to another pool
5902 5947           * until we join the zone.
5903 5948           */
5904 5949          if (pool_lock_intr() != 0) {
5905 5950                  zone_rele(zone);
5906 5951                  err = EINTR;
5907 5952                  goto out;
5908 5953          }
5909 5954          ASSERT(secpolicy_pool(CRED()) == 0);
5910 5955          /*
5911 5956           * Bind ourselves to the pool currently associated with the zone.
5912 5957           */
5913 5958          oldpool = curproc->p_pool;
5914 5959          newpool = zone_pool_get(zone);
5915 5960          if (pool_state == POOL_ENABLED && newpool != oldpool &&
5916 5961              (err = pool_do_bind(newpool, P_PID, P_MYID,
5917 5962              POOL_BIND_ALL)) != 0) {
                           /* err already carries pool_do_bind()'s failure code. */
5918 5963                  pool_unlock();
5919 5964                  zone_rele(zone);
5920 5965                  goto out;
5921 5966          }
5922 5967  
5923 5968          /*
5924 5969           * Grab cpu_lock now; we'll need it later when we call
5925 5970           * task_join().
5926 5971           */
5927 5972          mutex_enter(&cpu_lock);
5928 5973          mutex_enter(&zonehash_lock);
5929 5974          /*
5930 5975           * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5931 5976           */
5932 5977          if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5933 5978                  /*
5934 5979                   * Can't join anymore.
                            * Undo the optimistic pool binding made above before
                            * bailing out.
5935 5980                   */
5936 5981                  mutex_exit(&zonehash_lock);
5937 5982                  mutex_exit(&cpu_lock);
5938 5983                  if (pool_state == POOL_ENABLED &&
5939 5984                      newpool != oldpool)
5940 5985                          (void) pool_do_bind(oldpool, P_PID, P_MYID,
5941 5986                              POOL_BIND_ALL);
5942 5987                  pool_unlock();
5943 5988                  zone_rele(zone);
5944 5989                  err = EINVAL;
5945 5990                  goto out;
5946 5991          }
5947 5992  
5948 5993          /*
5949 5994           * a_lock must be held while transferring locked memory and swap
5950 5995           * reservation from the global zone to the non global zone because
5951 5996           * asynchronous faults on the processes' address space can lock
5952 5997           * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5953 5998           * segments respectively.
5954 5999           */
5955 6000          AS_LOCK_ENTER(pp->p_as, RW_WRITER);
5956 6001          swap = as_swresv();
5957 6002          mutex_enter(&pp->p_lock);
5958 6003          zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5959 6004          /* zone_nlwps_lock is held across the lwp, task and proc count updates */
5960 6005          mutex_enter(&zone->zone_nlwps_lock);
5961 6006          /* add new lwps to zone and zone's proj0 */
5962 6007          zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5963 6008          zone->zone_nlwps += pp->p_lwpcnt;
5964 6009          /* add 1 task to zone's proj0 */
5965 6010          zone_proj0->kpj_ntasks += 1;
5966 6011  
5967 6012          zone_proj0->kpj_nprocs++;
5968 6013          zone->zone_nprocs++;
5969 6014          mutex_exit(&zone->zone_nlwps_lock);
5970 6015  
                   /* credit locked memory and reserved swap to the new zone */
5971 6016          mutex_enter(&zone->zone_mem_lock);
5972 6017          zone->zone_locked_mem += pp->p_locked_mem;
5973 6018          zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5974 6019          zone->zone_max_swap += swap;
5975 6020          mutex_exit(&zone->zone_mem_lock);
5976 6021  
5977 6022          mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5978 6023          zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5979 6024          mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5980 6025  
5981 6026          /* remove lwps and process from proc's old zone and old project */
5982 6027          mutex_enter(&pp->p_zone->zone_nlwps_lock);
5983 6028          pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5984 6029          pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5985 6030          pp->p_task->tk_proj->kpj_nprocs--;
5986 6031          pp->p_zone->zone_nprocs--;
5987 6032          mutex_exit(&pp->p_zone->zone_nlwps_lock);
5988 6033  
                   /* debit the same locked memory and swap from the old zone */
5989 6034          mutex_enter(&pp->p_zone->zone_mem_lock);
5990 6035          pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5991 6036          pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5992 6037          pp->p_zone->zone_max_swap -= swap;
5993 6038          mutex_exit(&pp->p_zone->zone_mem_lock);
5994 6039  
5995 6040          mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5996 6041          pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5997 6042          mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5998 6043  
                   /* From here on pp->p_zone refers to the zone being entered. */
5999 6044          pp->p_flag |= SZONETOP;
6000 6045          pp->p_zone = zone;
6001 6046          mutex_exit(&pp->p_lock);
6002 6047          AS_LOCK_EXIT(pp->p_as);
6003 6048  
6004 6049          /*
6005 6050           * Joining the zone cannot fail from now on.
6006 6051           *
6007 6052           * This means that a lot of the following code can be commonized and
6008 6053           * shared with zsched().
6009 6054           */
6010 6055  
6011 6056          /*
6012 6057           * If the process contract fmri was inherited, we need to
6013 6058           * flag this so that any contract status will not leak
6014 6059           * extra zone information, svc_fmri in this case
6015 6060           */
6016 6061          if (ctp->conp_svc_ctid != ct->ct_id) {
6017 6062                  mutex_enter(&ct->ct_lock);
6018 6063                  ctp->conp_svc_zone_enter = ct->ct_id;
6019 6064                  mutex_exit(&ct->ct_lock);
6020 6065          }
6021 6066  
6022 6067          /*
6023 6068           * Reset the encapsulating process contract's zone.
6024 6069           */
6025 6070          ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6026 6071          contract_setzuniqid(ct, zone->zone_uniqid);
6027 6072  
6028 6073          /*
6029 6074           * Create a new task and associate the process with the project keyed
6030 6075           * by (projid,zoneid).
6031 6076           *
6032 6077           * We might as well be in project 0; the global zone's projid doesn't
6033 6078           * make much sense in a zone anyhow.
6034 6079           *
6035 6080           * This also increments zone_ntasks, and returns with p_lock held.
6036 6081           */
6037 6082          tk = task_create(0, zone);
6038 6083          oldtk = task_join(tk, 0);
6039 6084          mutex_exit(&cpu_lock);
6040 6085  
6041 6086          /*
6042 6087           * call RCTLOP_SET functions on this proc
6043 6088           */
6044 6089          e.rcep_p.zone = zone;
6045 6090          e.rcep_t = RCENTITY_ZONE;
6046 6091          (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6047 6092              RCD_CALLBACK);
6048 6093          mutex_exit(&pp->p_lock);
6049 6094  
6050 6095          /*
6051 6096           * We don't need to hold any of zsched's locks here; not only do we know
6052 6097           * the process and zone aren't going away, we know its session isn't
6053 6098           * changing either.
6054 6099           *
6055 6100           * By joining zsched's session here, we mimic the behavior in the
6056 6101           * global zone of init's sid being the pid of sched.  We extend this
6057 6102           * to all zlogin-like zone_enter()'ing processes as well.
6058 6103           */
6059 6104          mutex_enter(&pidlock);
6060 6105          sp = zone->zone_zsched->p_sessp;
6061 6106          sess_hold(zone->zone_zsched);
6062 6107          mutex_enter(&pp->p_lock);
6063 6108          pgexit(pp);
6064 6109          sess_rele(pp->p_sessp, B_TRUE);
6065 6110          pp->p_sessp = sp;
6066 6111          pgjoin(pp, zone->zone_zsched->p_pidp);
6067 6112  
6068 6113          /*
6069 6114           * If any threads are scheduled to be placed on zone wait queue they
6070 6115           * should abandon the idea since the wait queue is changing.
6071 6116           * We need to be holding pidlock & p_lock to do this.
6072 6117           */
6073 6118          if ((t = pp->p_tlist) != NULL) {
6074 6119                  do {
6075 6120                          thread_lock(t);
6076 6121                          /*
6077 6122                           * Kick this thread so that he doesn't sit
6078 6123                           * on a wrong wait queue.
6079 6124                           */
6080 6125                          if (ISWAITING(t))
6081 6126                                  setrun_locked(t);
6082 6127  
6083 6128                          if (t->t_schedflag & TS_ANYWAITQ)
6084 6129                                  t->t_schedflag &= ~ TS_ANYWAITQ;
6085 6130  
6086 6131                          thread_unlock(t);
6087 6132                  } while ((t = t->t_forw) != pp->p_tlist);
6088 6133          }
6089 6134  
6090 6135          /*
6091 6136           * If there is a default scheduling class for the zone and it is not
6092 6137           * the class we are currently in, change all of the threads in the
6093 6138           * process to the new class.  We need to be holding pidlock & p_lock
6094 6139           * when we call parmsset so this is a good place to do it.
6095 6140           */
6096 6141          if (zone->zone_defaultcid > 0 &&
6097 6142              zone->zone_defaultcid != curthread->t_cid) {
6098 6143                  pcparms_t pcparms;
6099 6144  
6100 6145                  pcparms.pc_cid = zone->zone_defaultcid;
6101 6146                  pcparms.pc_clparms[0] = 0;
6102 6147  
6103 6148                  /*
6104 6149                   * If setting the class fails, we still want to enter the zone.
6105 6150                   */
6106 6151                  if ((t = pp->p_tlist) != NULL) {
6107 6152                          do {
6108 6153                                  (void) parmsset(&pcparms, t);
6109 6154                          } while ((t = t->t_forw) != pp->p_tlist);
6110 6155                  }
6111 6156          }
6112 6157  
6113 6158          mutex_exit(&pp->p_lock);
6114 6159          mutex_exit(&pidlock);
6115 6160  
                   /* zonehash_lock has been held since the zone status recheck above. */
6116 6161          mutex_exit(&zonehash_lock);
6117 6162          /*
6118 6163           * We're firmly in the zone; let pools progress.
6119 6164           */
6120 6165          pool_unlock();
6121 6166          task_rele(oldtk);
6122 6167          /*
6123 6168           * We don't need to retain a hold on the zone since we already
6124 6169           * incremented zone_ntasks, so the zone isn't going anywhere.
6125 6170           */
6126 6171          zone_rele(zone);
6127 6172  
6128 6173          /*
6129 6174           * Chroot
                    * Both the current directory and the root directory are pointed
                    * at the zone's root vnode.
6130 6175           */
6131 6176          vp = zone->zone_rootvp;
6132 6177          zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6133 6178          zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6134 6179  
6135 6180          /*
6136 6181           * Change process credentials
6137 6182           */
6138 6183          newcr = cralloc();
6139 6184          mutex_enter(&pp->p_crlock);
6140 6185          cr = pp->p_cred;
6141 6186          crcopy_to(cr, newcr);
6142 6187          crsetzone(newcr, zone);
6143 6188          pp->p_cred = newcr;
6144 6189  
6145 6190          /*
6146 6191           * Restrict all process privilege sets to zone limit
6147 6192           */
6148 6193          priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6149 6194          priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6150 6195          priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6151 6196          priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6152 6197          mutex_exit(&pp->p_crlock);
6153 6198          crset(pp, newcr);
6154 6199  
6155 6200          /*
6156 6201           * Adjust upcount to reflect zone entry.
6157 6202           */
6158 6203          uid = crgetruid(newcr);
6159 6204          mutex_enter(&pidlock);
6160 6205          upcount_dec(uid, GLOBAL_ZONEID);
6161 6206          upcount_inc(uid, zoneid);
6162 6207          mutex_exit(&pidlock);
6163 6208  
6164 6209          /*
6165 6210           * Set up core file path and content.
6166 6211           */
6167 6212          set_core_defaults();
6168 6213  
6169 6214  out:
6170 6215          /*
6171 6216           * Let the other lwps continue.
                    * The success path and every failure after holdlwps() arrive
                    * here; err is still 0 on success.
6172 6217           */
6173 6218          mutex_enter(&pp->p_lock);
6174 6219          if (curthread != pp->p_agenttp)
6175 6220                  continuelwps(pp);
6176 6221          mutex_exit(&pp->p_lock);
6177 6222  
6178 6223          return (err != 0 ? set_errno(err) : 0);
6179 6224  }
6180 6225  
6181 6226  /*
6182 6227   * Systemcall entry point for zone_list(2).
6183 6228   *
6184 6229   * Processes running in a (non-global) zone only see themselves.
6185 6230   * On labeled systems, they see all zones whose label they dominate.
6186 6231   */
6187 6232  static int
6188 6233  zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6189 6234  {
6190 6235          zoneid_t *zoneids;
6191 6236          zone_t *zone, *myzone;
6192 6237          uint_t user_nzones, real_nzones;
6193 6238          uint_t domi_nzones;
6194 6239          int error;
6195 6240  
6196 6241          if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6197 6242                  return (set_errno(EFAULT));
6198 6243  
6199 6244          myzone = curproc->p_zone;
6200 6245          if (myzone != global_zone) {
6201 6246                  bslabel_t *mybslab;
6202 6247  
6203 6248                  if (!is_system_labeled()) {
6204 6249                          /* just return current zone */
6205 6250                          real_nzones = domi_nzones = 1;
6206 6251                          zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6207 6252                          zoneids[0] = myzone->zone_id;
6208 6253                  } else {
6209 6254                          /* return all zones that are dominated */
6210 6255                          mutex_enter(&zonehash_lock);
6211 6256                          real_nzones = zonecount;
6212 6257                          domi_nzones = 0;
6213 6258                          if (real_nzones > 0) {
6214 6259                                  zoneids = kmem_alloc(real_nzones *
6215 6260                                      sizeof (zoneid_t), KM_SLEEP);
6216 6261                                  mybslab = label2bslabel(myzone->zone_slabel);
6217 6262                                  for (zone = list_head(&zone_active);
6218 6263                                      zone != NULL;
6219 6264                                      zone = list_next(&zone_active, zone)) {
6220 6265                                          if (zone->zone_id == GLOBAL_ZONEID)
6221 6266                                                  continue;
6222 6267                                          if (zone != myzone &&
6223 6268                                              (zone->zone_flags & ZF_IS_SCRATCH))
6224 6269                                                  continue;
6225 6270                                          /*
6226 6271                                           * Note that a label always dominates
6227 6272                                           * itself, so myzone is always included
6228 6273                                           * in the list.
6229 6274                                           */
6230 6275                                          if (bldominates(mybslab,
6231 6276                                              label2bslabel(zone->zone_slabel))) {
6232 6277                                                  zoneids[domi_nzones++] =
6233 6278                                                      zone->zone_id;
6234 6279                                          }
6235 6280                                  }
6236 6281                          }
6237 6282                          mutex_exit(&zonehash_lock);
6238 6283                  }
6239 6284          } else {
6240 6285                  mutex_enter(&zonehash_lock);
6241 6286                  real_nzones = zonecount;
6242 6287                  domi_nzones = 0;
6243 6288                  if (real_nzones > 0) {
6244 6289                          zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6245 6290                              KM_SLEEP);
6246 6291                          for (zone = list_head(&zone_active); zone != NULL;
6247 6292                              zone = list_next(&zone_active, zone))
6248 6293                                  zoneids[domi_nzones++] = zone->zone_id;
6249 6294                          ASSERT(domi_nzones == real_nzones);
6250 6295                  }
6251 6296                  mutex_exit(&zonehash_lock);
6252 6297          }
6253 6298  
6254 6299          /*
6255 6300           * If user has allocated space for fewer entries than we found, then
6256 6301           * return only up to his limit.  Either way, tell him exactly how many
6257 6302           * we found.
6258 6303           */
6259 6304          if (domi_nzones < user_nzones)
6260 6305                  user_nzones = domi_nzones;
6261 6306          error = 0;
6262 6307          if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6263 6308                  error = EFAULT;
6264 6309          } else if (zoneidlist != NULL && user_nzones != 0) {
6265 6310                  if (copyout(zoneids, zoneidlist,
6266 6311                      user_nzones * sizeof (zoneid_t)) != 0)
6267 6312                          error = EFAULT;
6268 6313          }
6269 6314  
6270 6315          if (real_nzones > 0)
6271 6316                  kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6272 6317  
6273 6318          if (error != 0)
6274 6319                  return (set_errno(error));
6275 6320          else
6276 6321                  return (0);
6277 6322  }
6278 6323  
6279 6324  /*
6280 6325   * Systemcall entry point for zone_lookup(2).
6281 6326   *
6282 6327   * Non-global zones are only able to see themselves and (on labeled systems)
6283 6328   * the zones they dominate.
6284 6329   */
6285 6330  static zoneid_t
6286 6331  zone_lookup(const char *zone_name)
6287 6332  {
6288 6333          char *kname;
6289 6334          zone_t *zone;
6290 6335          zoneid_t zoneid;
6291 6336          int err;
6292 6337  
6293 6338          if (zone_name == NULL) {
6294 6339                  /* return caller's zone id */
6295 6340                  return (getzoneid());
6296 6341          }
6297 6342  
6298 6343          kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6299 6344          if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6300 6345                  kmem_free(kname, ZONENAME_MAX);
6301 6346                  return (set_errno(err));
6302 6347          }
6303 6348  
6304 6349          mutex_enter(&zonehash_lock);
6305 6350          zone = zone_find_all_by_name(kname);
6306 6351          kmem_free(kname, ZONENAME_MAX);
6307 6352          /*
6308 6353           * In a non-global zone, can only lookup global and own name.
6309 6354           * In Trusted Extensions zone label dominance rules apply.
6310 6355           */
6311 6356          if (zone == NULL ||
6312 6357              zone_status_get(zone) < ZONE_IS_READY ||
6313 6358              !zone_list_access(zone)) {
6314 6359                  mutex_exit(&zonehash_lock);
6315 6360                  return (set_errno(EINVAL));
6316 6361          } else {
6317 6362                  zoneid = zone->zone_id;
6318 6363                  mutex_exit(&zonehash_lock);
6319 6364                  return (zoneid);
6320 6365          }
6321 6366  }
6322 6367  
6323 6368  static int
6324 6369  zone_version(int *version_arg)
6325 6370  {
6326 6371          int version = ZONE_SYSCALL_API_VERSION;
6327 6372  
6328 6373          if (copyout(&version, version_arg, sizeof (int)) != 0)
6329 6374                  return (set_errno(EFAULT));
6330 6375          return (0);
6331 6376  }
6332 6377  
6333 6378  /* ARGSUSED */
6334 6379  long
6335 6380  zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6336 6381  {
6337 6382          zone_def zs;
6338 6383          int err;
6339 6384  
6340 6385          switch (cmd) {
6341 6386          case ZONE_CREATE:
6342 6387                  if (get_udatamodel() == DATAMODEL_NATIVE) {
6343 6388                          if (copyin(arg1, &zs, sizeof (zone_def))) {
6344 6389                                  return (set_errno(EFAULT));
6345 6390                          }
6346 6391                  } else {
6347 6392  #ifdef _SYSCALL32_IMPL
6348 6393                          zone_def32 zs32;
6349 6394  
6350 6395                          if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6351 6396                                  return (set_errno(EFAULT));
6352 6397                          }
6353 6398                          zs.zone_name =
6354 6399                              (const char *)(unsigned long)zs32.zone_name;
6355 6400                          zs.zone_root =
6356 6401                              (const char *)(unsigned long)zs32.zone_root;
6357 6402                          zs.zone_privs =
6358 6403                              (const struct priv_set *)
6359 6404                              (unsigned long)zs32.zone_privs;
6360 6405                          zs.zone_privssz = zs32.zone_privssz;
6361 6406                          zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6362 6407                          zs.rctlbufsz = zs32.rctlbufsz;
6363 6408                          zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6364 6409                          zs.zfsbufsz = zs32.zfsbufsz;
6365 6410                          zs.extended_error =
6366 6411                              (int *)(unsigned long)zs32.extended_error;
6367 6412                          zs.match = zs32.match;
6368 6413                          zs.doi = zs32.doi;
6369 6414                          zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6370 6415                          zs.flags = zs32.flags;
6371 6416  #else
6372 6417                          panic("get_udatamodel() returned bogus result\n");
6373 6418  #endif
6374 6419                  }
6375 6420  
6376 6421                  return (zone_create(zs.zone_name, zs.zone_root,
6377 6422                      zs.zone_privs, zs.zone_privssz,
6378 6423                      (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6379 6424                      (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6380 6425                      zs.extended_error, zs.match, zs.doi,
6381 6426                      zs.label, zs.flags));
6382 6427          case ZONE_BOOT:
6383 6428                  return (zone_boot((zoneid_t)(uintptr_t)arg1));
6384 6429          case ZONE_DESTROY:
6385 6430                  return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6386 6431          case ZONE_GETATTR:
6387 6432                  return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6388 6433                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6389 6434          case ZONE_SETATTR:
6390 6435                  return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6391 6436                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6392 6437          case ZONE_ENTER:
6393 6438                  return (zone_enter((zoneid_t)(uintptr_t)arg1));
6394 6439          case ZONE_LIST:
6395 6440                  return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6396 6441          case ZONE_SHUTDOWN:
6397 6442                  return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6398 6443          case ZONE_LOOKUP:
6399 6444                  return (zone_lookup((const char *)arg1));
6400 6445          case ZONE_VERSION:
6401 6446                  return (zone_version((int *)arg1));
6402 6447          case ZONE_ADD_DATALINK:
6403 6448                  return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6404 6449                      (datalink_id_t)(uintptr_t)arg2));
6405 6450          case ZONE_DEL_DATALINK:
6406 6451                  return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6407 6452                      (datalink_id_t)(uintptr_t)arg2));
6408 6453          case ZONE_CHECK_DATALINK: {
6409 6454                  zoneid_t        zoneid;
6410 6455                  boolean_t       need_copyout;
6411 6456  
6412 6457                  if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6413 6458                          return (EFAULT);
6414 6459                  need_copyout = (zoneid == ALL_ZONES);
6415 6460                  err = zone_check_datalink(&zoneid,
6416 6461                      (datalink_id_t)(uintptr_t)arg2);
6417 6462                  if (err == 0 && need_copyout) {
6418 6463                          if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6419 6464                                  err = EFAULT;
6420 6465                  }
6421 6466                  return (err == 0 ? 0 : set_errno(err));
6422 6467          }
6423 6468          case ZONE_LIST_DATALINK:
6424 6469                  return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6425 6470                      (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6426 6471          default:
6427 6472                  return (set_errno(EINVAL));
6428 6473          }
6429 6474  }
6430 6475  
/*
 * Argument bundle passed to the zone_ki_call_zoneadmd() kernel thread.  It is
 * allocated by zone_kadmin() and freed by the thread once it has copied the
 * contents out.
 */
struct zarg {
        /* zone to shut down; zone_kadmin() takes a hold before handing off */
        zone_t *zone;
        /* command block sent to zoneadmd as the door call argument */
        zone_cmd_arg_t arg;
};
6435 6480  
6436 6481  static int
6437 6482  zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6438 6483  {
6439 6484          char *buf;
6440 6485          size_t buflen;
6441 6486          int error;
6442 6487  
6443 6488          buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6444 6489          buf = kmem_alloc(buflen, KM_SLEEP);
6445 6490          (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6446 6491          error = door_ki_open(buf, doorp);
6447 6492          kmem_free(buf, buflen);
6448 6493          return (error);
6449 6494  }
6450 6495  
6451 6496  static void
6452 6497  zone_release_door(door_handle_t *doorp)
6453 6498  {
6454 6499          door_ki_rele(*doorp);
6455 6500          *doorp = NULL;
6456 6501  }
6457 6502  
6458 6503  static void
6459 6504  zone_ki_call_zoneadmd(struct zarg *zargp)
6460 6505  {
6461 6506          door_handle_t door = NULL;
6462 6507          door_arg_t darg, save_arg;
6463 6508          char *zone_name;
6464 6509          size_t zone_namelen;
6465 6510          zoneid_t zoneid;
6466 6511          zone_t *zone;
6467 6512          zone_cmd_arg_t arg;
6468 6513          uint64_t uniqid;
6469 6514          size_t size;
6470 6515          int error;
6471 6516          int retry;
6472 6517  
6473 6518          zone = zargp->zone;
6474 6519          arg = zargp->arg;
6475 6520          kmem_free(zargp, sizeof (*zargp));
6476 6521  
6477 6522          zone_namelen = strlen(zone->zone_name) + 1;
6478 6523          zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6479 6524          bcopy(zone->zone_name, zone_name, zone_namelen);
6480 6525          zoneid = zone->zone_id;
6481 6526          uniqid = zone->zone_uniqid;
6482 6527          /*
6483 6528           * zoneadmd may be down, but at least we can empty out the zone.
6484 6529           * We can ignore the return value of zone_empty() since we're called
6485 6530           * from a kernel thread and know we won't be delivered any signals.
6486 6531           */
6487 6532          ASSERT(curproc == &p0);
6488 6533          (void) zone_empty(zone);
6489 6534          ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6490 6535          zone_rele(zone);
6491 6536  
6492 6537          size = sizeof (arg);
6493 6538          darg.rbuf = (char *)&arg;
6494 6539          darg.data_ptr = (char *)&arg;
6495 6540          darg.rsize = size;
6496 6541          darg.data_size = size;
6497 6542          darg.desc_ptr = NULL;
6498 6543          darg.desc_num = 0;
6499 6544  
6500 6545          save_arg = darg;
6501 6546          /*
6502 6547           * Since we're not holding a reference to the zone, any number of
6503 6548           * things can go wrong, including the zone disappearing before we get a
6504 6549           * chance to talk to zoneadmd.
6505 6550           */
6506 6551          for (retry = 0; /* forever */; retry++) {
6507 6552                  if (door == NULL &&
6508 6553                      (error = zone_lookup_door(zone_name, &door)) != 0) {
6509 6554                          goto next;
6510 6555                  }
6511 6556                  ASSERT(door != NULL);
6512 6557  
6513 6558                  if ((error = door_ki_upcall_limited(door, &darg, NULL,
6514 6559                      SIZE_MAX, 0)) == 0) {
6515 6560                          break;
6516 6561                  }
6517 6562                  switch (error) {
6518 6563                  case EINTR:
6519 6564                          /* FALLTHROUGH */
6520 6565                  case EAGAIN:    /* process may be forking */
6521 6566                          /*
6522 6567                           * Back off for a bit
6523 6568                           */
6524 6569                          break;
6525 6570                  case EBADF:
6526 6571                          zone_release_door(&door);
6527 6572                          if (zone_lookup_door(zone_name, &door) != 0) {
6528 6573                                  /*
6529 6574                                   * zoneadmd may be dead, but it may come back to
6530 6575                                   * life later.
6531 6576                                   */
6532 6577                                  break;
6533 6578                          }
6534 6579                          break;
6535 6580                  default:
6536 6581                          cmn_err(CE_WARN,
6537 6582                              "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6538 6583                              error);
6539 6584                          goto out;
6540 6585                  }
6541 6586  next:
6542 6587                  /*
6543 6588                   * If this isn't the same zone_t that we originally had in mind,
6544 6589                   * then this is the same as if two kadmin requests come in at
6545 6590                   * the same time: the first one wins.  This means we lose, so we
6546 6591                   * bail.
6547 6592                   */
6548 6593                  if ((zone = zone_find_by_id(zoneid)) == NULL) {
6549 6594                          /*
6550 6595                           * Problem is solved.
6551 6596                           */
6552 6597                          break;
6553 6598                  }
6554 6599                  if (zone->zone_uniqid != uniqid) {
6555 6600                          /*
6556 6601                           * zoneid recycled
6557 6602                           */
6558 6603                          zone_rele(zone);
6559 6604                          break;
6560 6605                  }
6561 6606                  /*
6562 6607                   * We could zone_status_timedwait(), but there doesn't seem to
6563 6608                   * be much point in doing that (plus, it would mean that
6564 6609                   * zone_free() isn't called until this thread exits).
6565 6610                   */
6566 6611                  zone_rele(zone);
6567 6612                  delay(hz);
6568 6613                  darg = save_arg;
6569 6614          }
6570 6615  out:
6571 6616          if (door != NULL) {
6572 6617                  zone_release_door(&door);
6573 6618          }
6574 6619          kmem_free(zone_name, zone_namelen);
6575 6620          thread_exit();
6576 6621  }
6577 6622  
6578 6623  /*
6579 6624   * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6580 6625   * kadmin().  The caller is a process in the zone.
6581 6626   *
6582 6627   * In order to shutdown the zone, we will hand off control to zoneadmd
6583 6628   * (running in the global zone) via a door.  We do a half-hearted job at
6584 6629   * killing all processes in the zone, create a kernel thread to contact
6585 6630   * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6586 6631   * a form of generation number used to let zoneadmd (as well as
6587 6632   * zone_destroy()) know exactly which zone they're re talking about.
6588 6633   */
6589 6634  int
6590 6635  zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6591 6636  {
6592 6637          struct zarg *zargp;
6593 6638          zone_cmd_t zcmd;
6594 6639          zone_t *zone;
6595 6640  
6596 6641          zone = curproc->p_zone;
6597 6642          ASSERT(getzoneid() != GLOBAL_ZONEID);
6598 6643  
6599 6644          switch (cmd) {
6600 6645          case A_SHUTDOWN:
6601 6646                  switch (fcn) {
6602 6647                  case AD_HALT:
6603 6648                  case AD_POWEROFF:
6604 6649                          zcmd = Z_HALT;
6605 6650                          break;
6606 6651                  case AD_BOOT:
6607 6652                          zcmd = Z_REBOOT;
6608 6653                          break;
6609 6654                  case AD_IBOOT:
6610 6655                  case AD_SBOOT:
6611 6656                  case AD_SIBOOT:
6612 6657                  case AD_NOSYNC:
6613 6658                          return (ENOTSUP);
6614 6659                  default:
6615 6660                          return (EINVAL);
6616 6661                  }
6617 6662                  break;
6618 6663          case A_REBOOT:
6619 6664                  zcmd = Z_REBOOT;
6620 6665                  break;
6621 6666          case A_FTRACE:
6622 6667          case A_REMOUNT:
6623 6668          case A_FREEZE:
6624 6669          case A_DUMP:
6625 6670          case A_CONFIG:
6626 6671                  return (ENOTSUP);
6627 6672          default:
6628 6673                  ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6629 6674                  return (EINVAL);
6630 6675          }
6631 6676  
6632 6677          if (secpolicy_zone_admin(credp, B_FALSE))
6633 6678                  return (EPERM);
6634 6679          mutex_enter(&zone_status_lock);
6635 6680  
6636 6681          /*
6637 6682           * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6638 6683           * is in the zone.
6639 6684           */
6640 6685          ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6641 6686          if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6642 6687                  /*
6643 6688                   * This zone is already on its way down.
6644 6689                   */
6645 6690                  mutex_exit(&zone_status_lock);
6646 6691                  return (0);
6647 6692          }
6648 6693          /*
6649 6694           * Prevent future zone_enter()s
6650 6695           */
6651 6696          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6652 6697          mutex_exit(&zone_status_lock);
6653 6698  
6654 6699          /*
6655 6700           * Kill everyone now and call zoneadmd later.
6656 6701           * zone_ki_call_zoneadmd() will do a more thorough job of this
6657 6702           * later.
6658 6703           */
6659 6704          killall(zone->zone_id);
6660 6705          /*
6661 6706           * Now, create the thread to contact zoneadmd and do the rest of the
6662 6707           * work.  This thread can't be created in our zone otherwise
6663 6708           * zone_destroy() would deadlock.
6664 6709           */
6665 6710          zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6666 6711          zargp->arg.cmd = zcmd;
6667 6712          zargp->arg.uniqid = zone->zone_uniqid;
6668 6713          zargp->zone = zone;
6669 6714          (void) strcpy(zargp->arg.locale, "C");
6670 6715          /* mdep was already copied in for us by uadmin */
6671 6716          if (mdep != NULL)
6672 6717                  (void) strlcpy(zargp->arg.bootbuf, mdep,
6673 6718                      sizeof (zargp->arg.bootbuf));
6674 6719          zone_hold(zone);
6675 6720  
6676 6721          (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6677 6722              TS_RUN, minclsyspri);
6678 6723          exit(CLD_EXITED, 0);
6679 6724  
6680 6725          return (EINVAL);
6681 6726  }
6682 6727  
6683 6728  /*
6684 6729   * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6685 6730   * status to ZONE_IS_SHUTTING_DOWN.
6686 6731   *
6687 6732   * This function also shuts down all running zones to ensure that they won't
6688 6733   * fork new processes.
6689 6734   */
6690 6735  void
6691 6736  zone_shutdown_global(void)
6692 6737  {
6693 6738          zone_t *current_zonep;
6694 6739  
6695 6740          ASSERT(INGLOBALZONE(curproc));
6696 6741          mutex_enter(&zonehash_lock);
6697 6742          mutex_enter(&zone_status_lock);
6698 6743  
6699 6744          /* Modify the global zone's status first. */
6700 6745          ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6701 6746          zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6702 6747  
6703 6748          /*
6704 6749           * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6705 6750           * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6706 6751           * could cause assertions to fail (e.g., assertions about a zone's
6707 6752           * state during initialization, readying, or booting) or produce races.
6708 6753           * We'll let threads continue to initialize and ready new zones: they'll
6709 6754           * fail to boot the new zones when they see that the global zone is
6710 6755           * shutting down.
6711 6756           */
6712 6757          for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6713 6758              current_zonep = list_next(&zone_active, current_zonep)) {
6714 6759                  if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6715 6760                          zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6716 6761          }
6717 6762          mutex_exit(&zone_status_lock);
6718 6763          mutex_exit(&zonehash_lock);
6719 6764  }
6720 6765  
6721 6766  /*
6722 6767   * Returns true if the named dataset is visible in the current zone.
6723 6768   * The 'write' parameter is set to 1 if the dataset is also writable.
6724 6769   */
6725 6770  int
6726 6771  zone_dataset_visible(const char *dataset, int *write)
6727 6772  {
6728 6773          static int zfstype = -1;
6729 6774          zone_dataset_t *zd;
6730 6775          size_t len;
6731 6776          zone_t *zone = curproc->p_zone;
6732 6777          const char *name = NULL;
6733 6778          vfs_t *vfsp = NULL;
6734 6779  
6735 6780          if (dataset[0] == '\0')
6736 6781                  return (0);
6737 6782  
6738 6783          /*
6739 6784           * Walk the list once, looking for datasets which match exactly, or
6740 6785           * specify a dataset underneath an exported dataset.  If found, return
6741 6786           * true and note that it is writable.
6742 6787           */
6743 6788          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6744 6789              zd = list_next(&zone->zone_datasets, zd)) {
6745 6790  
6746 6791                  len = strlen(zd->zd_dataset);
6747 6792                  if (strlen(dataset) >= len &&
6748 6793                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6749 6794                      (dataset[len] == '\0' || dataset[len] == '/' ||
6750 6795                      dataset[len] == '@')) {
6751 6796                          if (write)
6752 6797                                  *write = 1;
6753 6798                          return (1);
6754 6799                  }
6755 6800          }
6756 6801  
6757 6802          /*
6758 6803           * Walk the list a second time, searching for datasets which are parents
6759 6804           * of exported datasets.  These should be visible, but read-only.
6760 6805           *
6761 6806           * Note that we also have to support forms such as 'pool/dataset/', with
6762 6807           * a trailing slash.
6763 6808           */
6764 6809          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6765 6810              zd = list_next(&zone->zone_datasets, zd)) {
6766 6811  
6767 6812                  len = strlen(dataset);
6768 6813                  if (dataset[len - 1] == '/')
6769 6814                          len--;  /* Ignore trailing slash */
6770 6815                  if (len < strlen(zd->zd_dataset) &&
6771 6816                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6772 6817                      zd->zd_dataset[len] == '/') {
6773 6818                          if (write)
6774 6819                                  *write = 0;
6775 6820                          return (1);
6776 6821                  }
6777 6822          }
6778 6823  
6779 6824          /*
6780 6825           * We reach here if the given dataset is not found in the zone_dataset
6781 6826           * list. Check if this dataset was added as a filesystem (ie. "add fs")
6782 6827           * instead of delegation. For this we search for the dataset in the
6783 6828           * zone_vfslist of this zone. If found, return true and note that it is
6784 6829           * not writable.
6785 6830           */
6786 6831  
6787 6832          /*
6788 6833           * Initialize zfstype if it is not initialized yet.
6789 6834           */
6790 6835          if (zfstype == -1) {
6791 6836                  struct vfssw *vswp = vfs_getvfssw("zfs");
6792 6837                  zfstype = vswp - vfssw;
6793 6838                  vfs_unrefvfssw(vswp);
6794 6839          }
6795 6840  
6796 6841          vfs_list_read_lock();
6797 6842          vfsp = zone->zone_vfslist;
6798 6843          do {
6799 6844                  ASSERT(vfsp);
6800 6845                  if (vfsp->vfs_fstype == zfstype) {
6801 6846                          name = refstr_value(vfsp->vfs_resource);
6802 6847  
6803 6848                          /*
6804 6849                           * Check if we have an exact match.
6805 6850                           */
6806 6851                          if (strcmp(dataset, name) == 0) {
6807 6852                                  vfs_list_unlock();
6808 6853                                  if (write)
6809 6854                                          *write = 0;
6810 6855                                  return (1);
6811 6856                          }
6812 6857                          /*
6813 6858                           * We need to check if we are looking for parents of
6814 6859                           * a dataset. These should be visible, but read-only.
6815 6860                           */
6816 6861                          len = strlen(dataset);
6817 6862                          if (dataset[len - 1] == '/')
6818 6863                                  len--;
6819 6864  
6820 6865                          if (len < strlen(name) &&
6821 6866                              bcmp(dataset, name, len) == 0 && name[len] == '/') {
6822 6867                                  vfs_list_unlock();
6823 6868                                  if (write)
6824 6869                                          *write = 0;
6825 6870                                  return (1);
6826 6871                          }
6827 6872                  }
6828 6873                  vfsp = vfsp->vfs_zone_next;
6829 6874          } while (vfsp != zone->zone_vfslist);
6830 6875  
6831 6876          vfs_list_unlock();
6832 6877          return (0);
6833 6878  }
6834 6879  
6835 6880  /*
6836 6881   * zone_find_by_any_path() -
6837 6882   *
6838 6883   * kernel-private routine similar to zone_find_by_path(), but which
6839 6884   * effectively compares against zone paths rather than zonerootpath
6840 6885   * (i.e., the last component of zonerootpaths, which should be "root/",
6841 6886   * are not compared.)  This is done in order to accurately identify all
6842 6887   * paths, whether zone-visible or not, including those which are parallel
6843 6888   * to /root/, such as /dev/, /home/, etc...
6844 6889   *
6845 6890   * If the specified path does not fall under any zone path then global
6846 6891   * zone is returned.
6847 6892   *
6848 6893   * The treat_abs parameter indicates whether the path should be treated as
6849 6894   * an absolute path although it does not begin with "/".  (This supports
6850 6895   * nfs mount syntax such as host:any/path.)
6851 6896   *
6852 6897   * The caller is responsible for zone_rele of the returned zone.
6853 6898   */
6854 6899  zone_t *
6855 6900  zone_find_by_any_path(const char *path, boolean_t treat_abs)
6856 6901  {
6857 6902          zone_t *zone;
6858 6903          int path_offset = 0;
6859 6904  
6860 6905          if (path == NULL) {
6861 6906                  zone_hold(global_zone);
6862 6907                  return (global_zone);
6863 6908          }
6864 6909  
6865 6910          if (*path != '/') {
6866 6911                  ASSERT(treat_abs);
6867 6912                  path_offset = 1;
6868 6913          }
6869 6914  
6870 6915          mutex_enter(&zonehash_lock);
6871 6916          for (zone = list_head(&zone_active); zone != NULL;
6872 6917              zone = list_next(&zone_active, zone)) {
6873 6918                  char    *c;
6874 6919                  size_t  pathlen;
6875 6920                  char *rootpath_start;
6876 6921  
6877 6922                  if (zone == global_zone)        /* skip global zone */
6878 6923                          continue;
6879 6924  
6880 6925                  /* scan backwards to find start of last component */
6881 6926                  c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6882 6927                  do {
6883 6928                          c--;
6884 6929                  } while (*c != '/');
6885 6930  
6886 6931                  pathlen = c - zone->zone_rootpath + 1 - path_offset;
6887 6932                  rootpath_start = (zone->zone_rootpath + path_offset);
6888 6933                  if (strncmp(path, rootpath_start, pathlen) == 0)
6889 6934                          break;
6890 6935          }
6891 6936          if (zone == NULL)
6892 6937                  zone = global_zone;
6893 6938          zone_hold(zone);
6894 6939          mutex_exit(&zonehash_lock);
6895 6940          return (zone);
6896 6941  }
6897 6942  
6898 6943  /*
6899 6944   * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6900 6945   * zone_dl_t pointer if found, and NULL otherwise.
6901 6946   */
6902 6947  static zone_dl_t *
6903 6948  zone_find_dl(zone_t *zone, datalink_id_t linkid)
6904 6949  {
6905 6950          zone_dl_t *zdl;
6906 6951  
6907 6952          ASSERT(mutex_owned(&zone->zone_lock));
6908 6953          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6909 6954              zdl = list_next(&zone->zone_dl_list, zdl)) {
6910 6955                  if (zdl->zdl_id == linkid)
6911 6956                          break;
6912 6957          }
6913 6958          return (zdl);
6914 6959  }
6915 6960  
6916 6961  static boolean_t
6917 6962  zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6918 6963  {
6919 6964          boolean_t exists;
6920 6965  
6921 6966          mutex_enter(&zone->zone_lock);
6922 6967          exists = (zone_find_dl(zone, linkid) != NULL);
6923 6968          mutex_exit(&zone->zone_lock);
6924 6969          return (exists);
6925 6970  }
6926 6971  
6927 6972  /*
6928 6973   * Add an data link name for the zone.
6929 6974   */
6930 6975  static int
6931 6976  zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6932 6977  {
6933 6978          zone_dl_t *zdl;
6934 6979          zone_t *zone;
6935 6980          zone_t *thiszone;
6936 6981  
6937 6982          if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6938 6983                  return (set_errno(ENXIO));
6939 6984  
6940 6985          /* Verify that the datalink ID doesn't already belong to a zone. */
6941 6986          mutex_enter(&zonehash_lock);
6942 6987          for (zone = list_head(&zone_active); zone != NULL;
6943 6988              zone = list_next(&zone_active, zone)) {
6944 6989                  if (zone_dl_exists(zone, linkid)) {
6945 6990                          mutex_exit(&zonehash_lock);
6946 6991                          zone_rele(thiszone);
6947 6992                          return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6948 6993                  }
6949 6994          }
6950 6995  
6951 6996          zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6952 6997          zdl->zdl_id = linkid;
6953 6998          zdl->zdl_net = NULL;
6954 6999          mutex_enter(&thiszone->zone_lock);
6955 7000          list_insert_head(&thiszone->zone_dl_list, zdl);
6956 7001          mutex_exit(&thiszone->zone_lock);
6957 7002          mutex_exit(&zonehash_lock);
6958 7003          zone_rele(thiszone);
6959 7004          return (0);
6960 7005  }
6961 7006  
6962 7007  static int
6963 7008  zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6964 7009  {
6965 7010          zone_dl_t *zdl;
6966 7011          zone_t *zone;
6967 7012          int err = 0;
6968 7013  
6969 7014          if ((zone = zone_find_by_id(zoneid)) == NULL)
6970 7015                  return (set_errno(EINVAL));
6971 7016  
6972 7017          mutex_enter(&zone->zone_lock);
6973 7018          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6974 7019                  err = ENXIO;
6975 7020          } else {
6976 7021                  list_remove(&zone->zone_dl_list, zdl);
6977 7022                  nvlist_free(zdl->zdl_net);
6978 7023                  kmem_free(zdl, sizeof (zone_dl_t));
6979 7024          }
6980 7025          mutex_exit(&zone->zone_lock);
6981 7026          zone_rele(zone);
6982 7027          return (err == 0 ? 0 : set_errno(err));
6983 7028  }
6984 7029  
6985 7030  /*
6986 7031   * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
6987 7032   * the linkid.  Otherwise we just check if the specified zoneidp has been
6988 7033   * assigned the supplied linkid.
6989 7034   */
6990 7035  int
6991 7036  zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6992 7037  {
6993 7038          zone_t *zone;
6994 7039          int err = ENXIO;
6995 7040  
6996 7041          if (*zoneidp != ALL_ZONES) {
6997 7042                  if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6998 7043                          if (zone_dl_exists(zone, linkid))
6999 7044                                  err = 0;
7000 7045                          zone_rele(zone);
7001 7046                  }
7002 7047                  return (err);
7003 7048          }
7004 7049  
7005 7050          mutex_enter(&zonehash_lock);
7006 7051          for (zone = list_head(&zone_active); zone != NULL;
7007 7052              zone = list_next(&zone_active, zone)) {
7008 7053                  if (zone_dl_exists(zone, linkid)) {
7009 7054                          *zoneidp = zone->zone_id;
7010 7055                          err = 0;
7011 7056                          break;
7012 7057                  }
7013 7058          }
7014 7059          mutex_exit(&zonehash_lock);
7015 7060          return (err);
7016 7061  }
7017 7062  
7018 7063  /*
7019 7064   * Get the list of datalink IDs assigned to a zone.
7020 7065   *
7021 7066   * On input, *nump is the number of datalink IDs that can fit in the supplied
7022 7067   * idarray.  Upon return, *nump is either set to the number of datalink IDs
7023 7068   * that were placed in the array if the array was large enough, or to the
7024 7069   * number of datalink IDs that the function needs to place in the array if the
7025 7070   * array is too small.
7026 7071   */
7027 7072  static int
7028 7073  zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7029 7074  {
7030 7075          uint_t num, dlcount;
7031 7076          zone_t *zone;
7032 7077          zone_dl_t *zdl;
7033 7078          datalink_id_t *idptr = idarray;
7034 7079  
7035 7080          if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7036 7081                  return (set_errno(EFAULT));
7037 7082          if ((zone = zone_find_by_id(zoneid)) == NULL)
7038 7083                  return (set_errno(ENXIO));
7039 7084  
7040 7085          num = 0;
7041 7086          mutex_enter(&zone->zone_lock);
7042 7087          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7043 7088              zdl = list_next(&zone->zone_dl_list, zdl)) {
7044 7089                  /*
7045 7090                   * If the list is bigger than what the caller supplied, just
7046 7091                   * count, don't do copyout.
7047 7092                   */
7048 7093                  if (++num > dlcount)
7049 7094                          continue;
7050 7095                  if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7051 7096                          mutex_exit(&zone->zone_lock);
7052 7097                          zone_rele(zone);
7053 7098                          return (set_errno(EFAULT));
7054 7099                  }
7055 7100                  idptr++;
7056 7101          }
7057 7102          mutex_exit(&zone->zone_lock);
7058 7103          zone_rele(zone);
7059 7104  
7060 7105          /* Increased or decreased, caller should be notified. */
7061 7106          if (num != dlcount) {
7062 7107                  if (copyout(&num, nump, sizeof (num)) != 0)
7063 7108                          return (set_errno(EFAULT));
7064 7109          }
7065 7110          return (0);
7066 7111  }
7067 7112  
7068 7113  /*
7069 7114   * Public interface for looking up a zone by zoneid. It's a customized version
7070 7115   * for netstack_zone_create(). It can only be called from the zsd create
7071 7116   * callbacks, since it doesn't have reference on the zone structure hence if
7072 7117   * it is called elsewhere the zone could disappear after the zonehash_lock
7073 7118   * is dropped.
7074 7119   *
7075 7120   * Furthermore it
7076 7121   * 1. Doesn't check the status of the zone.
7077 7122   * 2. It will be called even before zone_init is called, in that case the
7078 7123   *    address of zone0 is returned directly, and netstack_zone_create()
7079 7124   *    will only assign a value to zone0.zone_netstack, won't break anything.
7080 7125   * 3. Returns without the zone being held.
7081 7126   */
7082 7127  zone_t *
7083 7128  zone_find_by_id_nolock(zoneid_t zoneid)
7084 7129  {
7085 7130          zone_t *zone;
7086 7131  
7087 7132          mutex_enter(&zonehash_lock);
7088 7133          if (zonehashbyid == NULL)
7089 7134                  zone = &zone0;
7090 7135          else
7091 7136                  zone = zone_find_all_by_id(zoneid);
7092 7137          mutex_exit(&zonehash_lock);
7093 7138          return (zone);
7094 7139  }
7095 7140  
7096 7141  /*
7097 7142   * Walk the datalinks for a given zone
7098 7143   */
7099 7144  int
7100 7145  zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7101 7146      void *data)
7102 7147  {
7103 7148          zone_t          *zone;
7104 7149          zone_dl_t       *zdl;
7105 7150          datalink_id_t   *idarray;
7106 7151          uint_t          idcount = 0;
7107 7152          int             i, ret = 0;
7108 7153  
7109 7154          if ((zone = zone_find_by_id(zoneid)) == NULL)
7110 7155                  return (ENOENT);
7111 7156  
7112 7157          /*
7113 7158           * We first build an array of linkid's so that we can walk these and
7114 7159           * execute the callback with the zone_lock dropped.
7115 7160           */
7116 7161          mutex_enter(&zone->zone_lock);
7117 7162          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7118 7163              zdl = list_next(&zone->zone_dl_list, zdl)) {
7119 7164                  idcount++;
7120 7165          }
7121 7166  
7122 7167          if (idcount == 0) {
7123 7168                  mutex_exit(&zone->zone_lock);
7124 7169                  zone_rele(zone);
7125 7170                  return (0);
7126 7171          }
7127 7172  
7128 7173          idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7129 7174          if (idarray == NULL) {
7130 7175                  mutex_exit(&zone->zone_lock);
7131 7176                  zone_rele(zone);
7132 7177                  return (ENOMEM);
7133 7178          }
7134 7179  
7135 7180          for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7136 7181              i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7137 7182                  idarray[i] = zdl->zdl_id;
7138 7183          }
7139 7184  
7140 7185          mutex_exit(&zone->zone_lock);
7141 7186  
7142 7187          for (i = 0; i < idcount && ret == 0; i++) {
7143 7188                  if ((ret = (*cb)(idarray[i], data)) != 0)
7144 7189                          break;
7145 7190          }
7146 7191  
7147 7192          zone_rele(zone);
7148 7193          kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7149 7194          return (ret);
7150 7195  }
7151 7196  
7152 7197  static char *
7153 7198  zone_net_type2name(int type)
7154 7199  {
7155 7200          switch (type) {
7156 7201          case ZONE_NETWORK_ADDRESS:
7157 7202                  return (ZONE_NET_ADDRNAME);
7158 7203          case ZONE_NETWORK_DEFROUTER:
7159 7204                  return (ZONE_NET_RTRNAME);
7160 7205          default:
7161 7206                  return (NULL);
7162 7207          }
7163 7208  }
7164 7209  
7165 7210  static int
7166 7211  zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7167 7212  {
7168 7213          zone_t *zone;
7169 7214          zone_dl_t *zdl;
7170 7215          nvlist_t *nvl;
7171 7216          int err = 0;
7172 7217          uint8_t *new = NULL;
7173 7218          char *nvname;
7174 7219          int bufsize;
7175 7220          datalink_id_t linkid = znbuf->zn_linkid;
7176 7221  
7177 7222          if (secpolicy_zone_config(CRED()) != 0)
7178 7223                  return (set_errno(EPERM));
7179 7224  
7180 7225          if (zoneid == GLOBAL_ZONEID)
7181 7226                  return (set_errno(EINVAL));
7182 7227  
7183 7228          nvname = zone_net_type2name(znbuf->zn_type);
7184 7229          bufsize = znbuf->zn_len;
7185 7230          new = znbuf->zn_val;
7186 7231          if (nvname == NULL)
7187 7232                  return (set_errno(EINVAL));
7188 7233  
7189 7234          if ((zone = zone_find_by_id(zoneid)) == NULL) {
7190 7235                  return (set_errno(EINVAL));
7191 7236          }
7192 7237  
7193 7238          mutex_enter(&zone->zone_lock);
7194 7239          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7195 7240                  err = ENXIO;
7196 7241                  goto done;
7197 7242          }
7198 7243          if ((nvl = zdl->zdl_net) == NULL) {
7199 7244                  if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7200 7245                          err = ENOMEM;
7201 7246                          goto done;
7202 7247                  } else {
7203 7248                          zdl->zdl_net = nvl;
7204 7249                  }
7205 7250          }
7206 7251          if (nvlist_exists(nvl, nvname)) {
7207 7252                  err = EINVAL;
7208 7253                  goto done;
7209 7254          }
7210 7255          err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7211 7256          ASSERT(err == 0);
7212 7257  done:
7213 7258          mutex_exit(&zone->zone_lock);
7214 7259          zone_rele(zone);
7215 7260          if (err != 0)
7216 7261                  return (set_errno(err));
7217 7262          else
7218 7263                  return (0);
7219 7264  }
7220 7265  
7221 7266  static int
7222 7267  zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7223 7268  {
7224 7269          zone_t *zone;
7225 7270          zone_dl_t *zdl;
7226 7271          nvlist_t *nvl;
7227 7272          uint8_t *ptr;
7228 7273          uint_t psize;
7229 7274          int err = 0;
7230 7275          char *nvname;
7231 7276          int bufsize;
7232 7277          void *buf;
7233 7278          datalink_id_t linkid = znbuf->zn_linkid;
7234 7279  
7235 7280          if (zoneid == GLOBAL_ZONEID)
7236 7281                  return (set_errno(EINVAL));
7237 7282  
7238 7283          nvname = zone_net_type2name(znbuf->zn_type);
7239 7284          bufsize = znbuf->zn_len;
7240 7285          buf = znbuf->zn_val;
7241 7286  
7242 7287          if (nvname == NULL)
7243 7288                  return (set_errno(EINVAL));
7244 7289          if ((zone = zone_find_by_id(zoneid)) == NULL)
7245 7290                  return (set_errno(EINVAL));
7246 7291  
7247 7292          mutex_enter(&zone->zone_lock);
7248 7293          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7249 7294                  err = ENXIO;
7250 7295                  goto done;
7251 7296          }
7252 7297          if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7253 7298                  err = ENOENT;
7254 7299                  goto done;
7255 7300          }
7256 7301          err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7257 7302          ASSERT(err == 0);
7258 7303  
7259 7304          if (psize > bufsize) {
7260 7305                  err = ENOBUFS;
7261 7306                  goto done;
7262 7307          }
7263 7308          znbuf->zn_len = psize;
7264 7309          bcopy(ptr, buf, psize);
7265 7310  done:
7266 7311          mutex_exit(&zone->zone_lock);
7267 7312          zone_rele(zone);
7268 7313          if (err != 0)
7269 7314                  return (set_errno(err));
7270 7315          else
7271 7316                  return (0);
7272 7317  }

↓ open down ↓

1614 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX