il_7029-3 Wdiff usr/src/uts/common/os/zone.c

Print this page

7029 want per-process exploit mitigation features (secflags)
7030 want basic address space layout randomization (aslr)
7031 noexec_user_stack should be a secflag
7032 want a means to forbid mappings around NULL.

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/zone.c
          +++ new/usr/src/uts/common/os/zone.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2015, Joyent Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  /*
  28   28   * Zones
  29   29   *
  30   30   *   A zone is a named collection of processes, namespace constraints,
  31   31   *   and other system resources which comprise a secure and manageable
  32   32   *   application containment facility.
  33   33   *
  34   34   *   Zones (represented by the reference counted zone_t) are tracked in
  35   35   *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  36   36   *   (zoneid_t) are used to track zone association.  Zone IDs are
  37   37   *   dynamically generated when the zone is created; if a persistent
  38   38   *   identifier is needed (core files, accounting logs, audit trail,
  39   39   *   etc.), the zone name should be used.
  40   40   *
  41   41   *
  42   42   *   Global Zone:
  43   43   *
  44   44   *   The global zone (zoneid 0) is automatically associated with all
  45   45   *   system resources that have not been bound to a user-created zone.
  46   46   *   This means that even systems where zones are not in active use
  47   47   *   have a global zone, and all processes, mounts, etc. are
  48   48   *   associated with that zone.  The global zone is generally
  49   49   *   unconstrained in terms of privileges and access, though the usual
  50   50   *   credential and privilege based restrictions apply.
  51   51   *
  52   52   *
  53   53   *   Zone States:
  54   54   *
  55   55   *   The states a zone may be in, and the transitions between them, are
  56   56   *   as follows:
  57   57   *
  58   58   *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  59   59   *   initialized zone is added to the list of active zones on the system but
  60   60   *   isn't accessible.
  61   61   *
  62   62   *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  63   63   *   not yet completed. Not possible to enter the zone, but attributes can
  64   64   *   be retrieved.
  65   65   *
  66   66   *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  67   67   *   ready.  The zone is made visible after the ZSD constructor callbacks are
  68   68   *   executed.  A zone remains in this state until it transitions into
  69   69   *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  70   70   *
  71   71   *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  72   72   *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  73   73   *   state.
  74   74   *
  75   75   *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  76   76   *   successfully started init.   A zone remains in this state until
  77   77   *   zone_shutdown() is called.
  78   78   *
  79   79   *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  80   80   *   killing all processes running in the zone. The zone remains
  81   81   *   in this state until there are no more user processes running in the zone.
  82   82   *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  83   83   *   Since zone_shutdown() is restartable, it may be called successfully
  84   84   *   multiple times for the same zone_t.  Setting of the zone's state to
  85   85   *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  86   86   *   the zone's status without worrying about it being a moving target.
  87   87   *
  88   88   *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  89   89   *   are no more user processes in the zone.  The zone remains in this
  90   90   *   state until there are no more kernel threads associated with the
  91   91   *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  92   92   *   fail.
  93   93   *
  94   94   *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  95   95   *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  96   96   *   join the zone or create kernel threads therein.
  97   97   *
  98   98   *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
  99   99   *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 100  100   *   return NULL from now on.
 101  101   *
 102  102   *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 103  103   *   processes or threads doing work on behalf of the zone.  The zone is
 104  104   *   removed from the list of active zones.  zone_destroy() returns, and
 105  105   *   the zone can be recreated.
 106  106   *
 107  107   *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 108  108   *   callbacks are executed, and all memory associated with the zone is
 109  109   *   freed.
 110  110   *
 111  111   *   Threads can wait for the zone to enter a requested state by using
 112  112   *   zone_status_wait() or zone_status_timedwait() with the desired
 113  113   *   state passed in as an argument.  Zone state transitions are
 114  114   *   uni-directional; it is not possible to move back to an earlier state.
 115  115   *
 116  116   *
 117  117   *   Zone-Specific Data:
 118  118   *
 119  119   *   Subsystems needing to maintain zone-specific data can store that
 120  120   *   data using the ZSD mechanism.  This provides a zone-specific data
 121  121   *   store, similar to thread-specific data (see pthread_getspecific(3C)
 122  122   *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 123  123   *   to register callbacks to be invoked when a zone is created, shut
 124  124   *   down, or destroyed.  This can be used to initialize zone-specific
 125  125   *   data for new zones and to clean up when zones go away.
 126  126   *
 127  127   *
 128  128   *   Data Structures:
 129  129   *
 130  130   *   The per-zone structure (zone_t) is reference counted, and freed
 131  131   *   when all references are released.  zone_hold and zone_rele can be
 132  132   *   used to adjust the reference count.  In addition, reference counts
 133  133   *   associated with the cred_t structure are tracked separately using
 134  134   *   zone_cred_hold and zone_cred_rele.
 135  135   *
 136  136   *   Pointers to active zone_t's are stored in two hash tables; one
 137  137   *   for searching by id, the other for searching by name.  Lookups
 138  138   *   can be performed on either basis, using zone_find_by_id and
 139  139   *   zone_find_by_name.  Both return zone_t pointers with the zone
 140  140   *   held, so zone_rele should be called when the pointer is no longer
 141  141   *   needed.  Zones can also be searched by path; zone_find_by_path
 142  142   *   returns the zone with which a path name is associated (global
 143  143   *   zone if the path is not within some other zone's file system
 144  144   *   hierarchy).  This currently requires iterating through each zone,
 145  145   *   so it is slower than an id or name search via a hash table.
 146  146   *
 147  147   *
 148  148   *   Locking:
 149  149   *
 150  150   *   zonehash_lock: This is a top-level global lock used to protect the
 151  151   *       zone hash tables and lists.  Zones cannot be created or destroyed
 152  152   *       while this lock is held.
 153  153   *   zone_status_lock: This is a global lock protecting zone state.
 154  154   *       Zones cannot change state while this lock is held.  It also
 155  155   *       protects the list of kernel threads associated with a zone.
 156  156   *   zone_lock: This is a per-zone lock used to protect several fields of
 157  157   *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 158  158   *       this lock means that the zone cannot go away.
 159  159   *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 160  160   *       related to the zone.max-lwps rctl.
 161  161   *   zone_mem_lock: This is a per-zone lock used to protect the fields
 162  162   *       related to the zone.max-locked-memory and zone.max-swap rctls.
 163  163   *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 164  164   *       currently just max_lofi
 165  165   *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 166  166   *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 167  167   *       list (a list of zones in the ZONE_IS_DEAD state).
 168  168   *
 169  169   *   Ordering requirements:
 170  170   *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 171  171   *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 172  172   *
 173  173   *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 174  174   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 175  175   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 176  176   *
 177  177   *   Blocking memory allocations are permitted while holding any of the
 178  178   *   zone locks.
 179  179   *
 180  180   *
 181  181   *   System Call Interface:
 182  182   *
 183  183   *   The zone subsystem can be managed and queried from user level with
 184  184   *   the following system calls (all subcodes of the primary "zone"
 185  185   *   system call):
 186  186   *   - zone_create: creates a zone with selected attributes (name,
 187  187   *     root path, privileges, resource controls, ZFS datasets)
 188  188   *   - zone_enter: allows the current process to enter a zone
 189  189   *   - zone_getattr: reports attributes of a zone
 190  190   *   - zone_setattr: set attributes of a zone
 191  191   *   - zone_boot: set 'init' running for the zone
 192  192   *   - zone_list: lists all zones active in the system
 193  193   *   - zone_lookup: looks up zone id based on name
 194  194   *   - zone_shutdown: initiates shutdown process (see states above)
 195  195   *   - zone_destroy: completes shutdown process (see states above)
 196  196   *
 197  197   */
 198  198  
 199  199  #include <sys/priv_impl.h>
 200  200  #include <sys/cred.h>
 201  201  #include <c2/audit.h>
 202  202  #include <sys/debug.h>
 203  203  #include <sys/file.h>
 204  204  #include <sys/kmem.h>
 205  205  #include <sys/kstat.h>
 206  206  #include <sys/mutex.h>
 207  207  #include <sys/note.h>
 208  208  #include <sys/pathname.h>
 209  209  #include <sys/proc.h>
 210  210  #include <sys/project.h>
 211  211  #include <sys/sysevent.h>
 212  212  #include <sys/task.h>
 213  213  #include <sys/systm.h>
 214  214  #include <sys/types.h>
 215  215  #include <sys/utsname.h>
 216  216  #include <sys/vnode.h>
 217  217  #include <sys/vfs.h>
 218  218  #include <sys/systeminfo.h>
 219  219  #include <sys/policy.h>
 220  220  #include <sys/cred_impl.h>
 221  221  #include <sys/contract_impl.h>
 222  222  #include <sys/contract/process_impl.h>
 223  223  #include <sys/class.h>
 224  224  #include <sys/pool.h>
 225  225  #include <sys/pool_pset.h>
 226  226  #include <sys/pset.h>
 227  227  #include <sys/strlog.h>
 228  228  #include <sys/sysmacros.h>
 229  229  #include <sys/callb.h>
 230  230  #include <sys/vmparam.h>
 231  231  #include <sys/corectl.h>
 232  232  #include <sys/ipc_impl.h>
 233  233  #include <sys/klpd.h>
 234  234  
 235  235  #include <sys/door.h>
 236  236  #include <sys/cpuvar.h>
 237  237  #include <sys/sdt.h>
 238  238  
 239  239  #include <sys/uadmin.h>
 240  240  #include <sys/session.h>
 241  241  #include <sys/cmn_err.h>
 242  242  #include <sys/modhash.h>
 243  243  #include <sys/sunddi.h>
 244  244  #include <sys/nvpair.h>
 245  245  #include <sys/rctl.h>
 246  246  #include <sys/fss.h>
 247  247  #include <sys/brand.h>
 248  248  #include <sys/zone.h>
 249  249  #include <net/if.h>
 250  250  #include <sys/cpucaps.h>
 251  251  #include <vm/seg.h>
 252  252  #include <sys/mac.h>
 253  253  
 254  254  /*
 255  255   * This constant specifies the number of seconds that threads waiting for
 256  256   * subsystems to release a zone's general-purpose references will wait before
 257  257   * they log the zone's reference counts.  The constant's value shouldn't
 258  258   * be so small that reference counts are unnecessarily reported for zones
 259  259   * whose references are slowly released.  On the other hand, it shouldn't be so
 260  260   * large that users reboot their systems out of frustration over hung zones
 261  261   * before the system logs the zones' reference counts.
 262  262   */
 263  263  #define ZONE_DESTROY_TIMEOUT_SECS       60
 264  264  
 265  265  /* One entry in the list of data link IDs which are accessible from the zone */
 266  266  typedef struct zone_dl {
 267  267          datalink_id_t   zdl_id;         /* ID of the data link */
 268  268          nvlist_t        *zdl_net;       /* NOTE(review): presumably per-link network data; confirm */
 269  269          list_node_t     zdl_linkage;    /* linkage into the containing list */
 270  270  } zone_dl_t;
 271  271  
 272  272  /*
 273  273   * cv used to signal that all references to the zone have been released.  This
 274  274   * needs to be global since there may be multiple waiters, and the first to
 275  275   * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 276  276   */
 277  277  static kcondvar_t zone_destroy_cv;
 278  278  /*
 279  279   * Lock used to serialize access to zone_cv.  This could have been per-zone,
 280  280   * but then we'd need another lock for zone_destroy_cv, and why bother?
 281  281   */
 282  282  static kmutex_t zone_status_lock;
 283  283  
 284  284  /*
 285  285   * ZSD-related global variables.
 286  286   */
 287  287  static kmutex_t zsd_key_lock;   /* protects the following two */
 288  288  /*
 289  289   * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 290  290   */
 291  291  static zone_key_t zsd_keyval = 0;
 292  292  /*
 293  293   * Global list of registered keys.  We use this when a new zone is created.
 294  294   */
 295  295  static list_t zsd_registered_keys;
 296  296  
 297  297  int zone_hash_size = 256;       /* NOTE(review): presumably sizes the zone hashes; confirm */
 298  298  static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;    /* zone lookup hashes */
 299  299  static kmutex_t zonehash_lock;  /* protects the zone hash tables and lists */
 300  300  static uint_t zonecount;        /* NOTE(review): presumably the number of zones; confirm */
 301  301  static id_space_t *zoneid_space;        /* NOTE(review): presumably the zone ID allocator; confirm */
 302  302  
 303  303  /*
 304  304   * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 305  305   * kernel proper runs, and which manages all other zones.
 306  306   *
 307  307   * Although not declared as static, the variable "zone0" should not be used
 308  308   * except by code that needs to reference the global zone early on in boot,
 309  309   * before it is fully initialized.  All other consumers should use
 310  310   * 'global_zone'.
 311  311   */
 312  312  zone_t zone0;
 313  313  zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 314  314  
 315  315  /*
 316  316   * List of active zones, protected by zonehash_lock.
 317  317   */
 318  318  static list_t zone_active;
 319  319  
 320  320  /*
 321  321   * List of destroyed zones that still have outstanding cred references.
 322  322   * Used for debugging.  Uses a separate lock to avoid lock ordering
 323  323   * problems in zone_free.
 324  324   */
 325  325  static list_t zone_deathrow;
 326  326  static kmutex_t zone_deathrow_lock;
 327  327  
 328  328  /* number of zones is limited by virtual interface limit in IP */
 329  329  uint_t maxzones = 8192;
 330  330  
 331  331  /* Event channel used to send zone state change notifications */
 332  332  evchan_t *zone_event_chan;
 333  333  
 334  334  /*
 335  335   * This table holds the mapping from kernel zone states to
 336  336   * states visible in the state notification API.
 337  337   * The idea is that we only expose "obvious" states and
 338  338   * do not expose states which are just implementation details.
 339  339   */
 340  340  const char  *zone_status_table[] = {
 341  341          ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 342  342          ZONE_EVENT_INITIALIZED,         /* initialized */
 343  343          ZONE_EVENT_READY,               /* ready */
 344  344          ZONE_EVENT_READY,               /* booting: reported as ready */
 345  345          ZONE_EVENT_RUNNING,             /* running */
 346  346          ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 347  347          ZONE_EVENT_SHUTTING_DOWN,       /* empty: reported as shutting down */
 348  348          ZONE_EVENT_SHUTTING_DOWN,       /* down: reported as shutting down */
 349  349          ZONE_EVENT_SHUTTING_DOWN,       /* dying: reported as shutting down */
 350  350          ZONE_EVENT_UNINITIALIZED,       /* dead: reported as uninitialized */
 351  351  };
 352  352  
 353  353  /*
 354  354   * This array contains the names of the subsystems listed in zone_ref_subsys_t
 355  355   * (see sys/zone.h).  Each entry below is annotated with its ZONE_REF_* constant.
 356  356   */
 357  357  static char *zone_ref_subsys_names[] = {
 358  358          "NFS",          /* ZONE_REF_NFS */
 359  359          "NFSv4",        /* ZONE_REF_NFSV4 */
 360  360          "SMBFS",        /* ZONE_REF_SMBFS */
 361  361          "MNTFS",        /* ZONE_REF_MNTFS */
 362  362          "LOFI",         /* ZONE_REF_LOFI */
 363  363          "VFS",          /* ZONE_REF_VFS */
 364  364          "IPC"           /* ZONE_REF_IPC */
 365  365  };
 366  366  
 367  367  /*
 368  368   * Resource control (rctl) handles for zones.  These aren't static so lint doesn't complain.
 369  369   */
 370  370  rctl_hndl_t rc_zone_cpu_shares;
 371  371  rctl_hndl_t rc_zone_locked_mem;
 372  372  rctl_hndl_t rc_zone_max_swap;
 373  373  rctl_hndl_t rc_zone_max_lofi;
 374  374  rctl_hndl_t rc_zone_cpu_cap;
 375  375  rctl_hndl_t rc_zone_nlwps;
 376  376  rctl_hndl_t rc_zone_nprocs;
 377  377  rctl_hndl_t rc_zone_shmmax;
 378  378  rctl_hndl_t rc_zone_shmmni;
 379  379  rctl_hndl_t rc_zone_semmni;
 380  380  rctl_hndl_t rc_zone_msgmni;
 381  381  
 382  382  const char * const zone_default_initname = "/sbin/init";
 383  383  static char * const zone_prefix = "/zone/";
 384  384  static int zone_shutdown(zoneid_t zoneid);
 385  385  static int zone_add_datalink(zoneid_t, datalink_id_t);
 386  386  static int zone_remove_datalink(zoneid_t, datalink_id_t);
 387  387  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 388  388  static int zone_set_network(zoneid_t, zone_net_data_t *);
 389  389  static int zone_get_network(zoneid_t, zone_net_data_t *);
 390  390  
 391  391  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 392  392  
 393  393  static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 394  394  static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 395  395  static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 396  396  static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 397  397      zone_key_t);
 398  398  static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 399  399  static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 400  400      kmutex_t *);
 401  401  static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 402  402      kmutex_t *);
 403  403  
 404  404  /*
 405  405   * Bump this number when you alter the zone syscall interfaces; this is
 406  406   * because we need to have support for previous API versions in libc
 407  407   * to support patching; libc calls into the kernel to determine this number.
 408  408   *
 409  409   * Version 1 of the API is the version originally shipped with Solaris 10
 410  410   * Version 2 alters the zone_create system call in order to support more
 411  411   *     arguments by moving the args into a structure; and to do better
 412  412   *     error reporting when zone_create() fails.
 413  413   * Version 3 alters the zone_create system call in order to support the
 414  414   *     import of ZFS datasets to zones.
 415  415   * Version 4 alters the zone_create system call in order to support
 416  416   *     Trusted Extensions.
 417  417   * Version 5 alters the zone_boot system call, and converts its old
 418  418   *     bootargs parameter to be set by the zone_setattr API instead.
 419  419   * Version 6 adds the flag argument to zone_create.
 420  420   */
 421  421  static const int ZONE_SYSCALL_API_VERSION = 6;
 422  422  
 423  423  /*
 424  424   * Certain filesystems (such as NFS and autofs) need to know which zone
 425  425   * the mount is being placed in.  Because of this, we need to be able to
 426  426   * ensure that a zone isn't in the process of being created/destroyed such
 427  427   * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 428  428   * it gets added to the list of mounted zones, it ends up on the wrong zone's
 429  429   * mount list. Since a zone can't reside on an NFS file system, we don't
 430  430   * have to worry about the zonepath itself.
 431  431   *
 432  432   * The following functions: block_mounts()/resume_mounts() and
 433  433   * mount_in_progress()/mount_completed() are used by zones and the VFS
 434  434   * layer (respectively) to synchronize zone state transitions and new
 435  435   * mounts within a zone. This synchronization is on a per-zone basis, so
 436  436   * activity for one zone will not interfere with activity for another zone.
 437  437   *
 438  438   * The semantics are like a reader-reader lock such that there may
 439  439   * either be multiple mounts (or zone state transitions, if that weren't
 440  440   * serialized by zonehash_lock) in progress at the same time, but not
 441  441   * both.
 442  442   *
 443  443   * We use cv's so the user can ctrl-C out of the operation if it's
 444  444   * taking too long.
 445  445   *
 446  446   * The semantics are such that there is unfair bias towards the
 447  447   * "current" operation.  This means that zone halt may starve if
 448  448   * there is a rapid succession of new mounts coming in to the zone.
 449  449   */
 450  450  /*
 451  451   * Prevent new mounts in zone zp from progressing to the point of calling
 452  452   * VFS_MOUNT().  If there are already mounts in this "region", wait for
 453  453   * them to complete.  Returns 1 once mounts are blocked, or 0 if cv_wait_sig() returned 0 first.
 454  454   */
 455  455  static int
 456  456  block_mounts(zone_t *zp)
 457  457  {
 458  458          int retval = 0;         /* stays 0 unless mounts actually get blocked */
 459  459  
 460  460          /*
 461  461           * Since it may block for a long time, block_mounts() shouldn't be
 462  462           * called with zonehash_lock held.
 463  463           */
 464  464          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 465  465          mutex_enter(&zp->zone_mount_lock);
 466  466          while (zp->zone_mounts_in_progress > 0) {       /* positive: mounts are underway */
 467  467                  if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 468  468                          goto signaled;  /* give up; the counter is left untouched */
 469  469          }
 470  470          /*
 471  471           * A negative value of mounts_in_progress indicates that mounts
 472  472           * have been blocked by (-mounts_in_progress) different callers
 473  473           * (remotely possible if two threads enter zone_shutdown at the same
 474  474           * time).
 475  475           */
 476  476          zp->zone_mounts_in_progress--;
 477  477          retval = 1;
 478  478  signaled:
 479  479          mutex_exit(&zp->zone_mount_lock);
 480  480          return (retval);
 481  481  }
 482  482  
 483  483  /*
 484  484   * The VFS layer may progress with new mounts in zone zp as far as we're concerned.
 485  485   * Increments zone_mounts_in_progress and wakes all waiters if we were the last obstacle.
 486  486   */
 487  487  static void
 488  488  resume_mounts(zone_t *zp)
 489  489  {
 490  490          mutex_enter(&zp->zone_mount_lock);
 491  491          if (++zp->zone_mounts_in_progress == 0) /* counter reached 0 after the increment */
 492  492                  cv_broadcast(&zp->zone_mount_cv);
 493  493          mutex_exit(&zp->zone_mount_lock);
 494  494  }
 495  495  
 496  496  /*
 497  497   * The VFS layer is busy with a mount; zone zp should wait until all
 498  498   * of its mounts are completed to progress.  Waits with cv_wait() while the count is negative, then increments it.
 499  499   */
 500  500  void
 501  501  mount_in_progress(zone_t *zp)
 502  502  {
 503  503          mutex_enter(&zp->zone_mount_lock);
 504  504          while (zp->zone_mounts_in_progress < 0) /* negative: wait for it to rise */
 505  505                  cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 506  506          zp->zone_mounts_in_progress++;
 507  507          mutex_exit(&zp->zone_mount_lock);
 508  508  }
 509  509  
 510  510  /*
 511  511   * VFS is done with one mount in zone zp; decrement the count and wake up any
 512  512   * waiters on zone_mount_cv (e.g. block_mounts() callers) if this is the last mount.
 513  513   */
 514  514  void
 515  515  mount_completed(zone_t *zp)
 516  516  {
 517  517          mutex_enter(&zp->zone_mount_lock);
 518  518          if (--zp->zone_mounts_in_progress == 0) /* counter reached 0 after the decrement */
 519  519                  cv_broadcast(&zp->zone_mount_cv);
 520  520          mutex_exit(&zp->zone_mount_lock);
 521  521  }
 522  522  
 523  523  /*
 524  524   * ZSD routines.
 525  525   *
 526  526   * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 527  527   * defined by the pthread_key_create() and related interfaces.
 528  528   *
 529  529   * Kernel subsystems may register one or more data items and/or
 530  530   * callbacks to be executed when a zone is created, shutdown, or
 531  531   * destroyed.
 532  532   *
 533  533   * Unlike the thread counterpart, destructor callbacks will be executed
 534  534   * even if the data pointer is NULL and/or there are no constructor
 535  535   * callbacks, so it is the responsibility of such callbacks to check for
 536  536   * NULL data values if necessary.
 537  537   *
 538  538   * The locking strategy and overall picture is as follows:
 539  539   *
 540  540   * When someone calls zone_key_create(), a template ZSD entry is added to the
 541  541   * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 542  542   * holding that lock all the existing zones are marked as
 543  543   * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 544  544   * zone_zsd list (protected by zone_lock). The global list is updated first
 545  545   * (under zsd_key_lock) to make sure that newly created zones use the
 546  546   * most recent list of keys. Then under zonehash_lock we walk the zones
 547  547   * and mark them.  Similar locking is used in zone_key_delete().
 548  548   *
 549  549   * The actual create, shutdown, and destroy callbacks are done without
 550  550   * holding any lock. And zsd_flags are used to ensure that the operations
 551  551   * completed so that when zone_key_create (and zone_create) is done, as well as
 552  552   * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 553  553   * are completed.
 554  554   *
 555  555   * When new zones are created constructor callbacks for all registered ZSD
 556  556   * entries will be called. That also uses the above two phases of marking
 557  557   * what needs to be done, and then running the callbacks without holding
 558  558   * any locks.
 559  559   *
 560  560   * The framework does not provide any locking around zone_getspecific() and
 561  561   * zone_setspecific() apart from that needed for internal consistency, so
 562  562   * callers interested in atomic "test-and-set" semantics will need to provide
 563  563   * their own locking.
 564  564   */
 565  565  
 566  566  /*
 567  567   * Helper function to find the zsd_entry associated with the key in the
 568  568   * given list.  Walks l from its head; returns the first entry whose zsd_key equals key, or NULL if none.
 569  569   */
 570  570  static struct zsd_entry *
 571  571  zsd_find(list_t *l, zone_key_t key)
 572  572  {
 573  573          struct zsd_entry *zsd;
 574  574  
 575  575          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 576  576                  if (zsd->zsd_key == key) {
 577  577                          return (zsd);
 578  578                  }
 579  579          }
 580  580          return (NULL);  /* no entry with this key */
 581  581  }
 582  582  
 583  583  /*
 584  584   * Helper function to find the zsd_entry associated with the key in the
 585  585   * given list. Move it to the front of the list.  Returns the entry, or NULL (list unchanged) if none matches.
 586  586   */
 587  587  static struct zsd_entry *
 588  588  zsd_find_mru(list_t *l, zone_key_t key)
 589  589  {
 590  590          struct zsd_entry *zsd;
 591  591  
 592  592          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 593  593                  if (zsd->zsd_key == key) {
 594  594                          /*
 595  595                           * Move to head of list to keep list in MRU order (skipped when already at the head).
 596  596                           */
 597  597                          if (zsd != list_head(l)) {
 598  598                                  list_remove(l, zsd);
 599  599                                  list_insert_head(l, zsd);
 600  600                          }
 601  601                          return (zsd);
 602  602                  }
 603  603          }
 604  604          return (NULL);  /* no entry with this key */
 605  605  }
 606  606  
           /*
            * Register a new ZSD key along with its optional create, shutdown and
            * destroy callbacks (any of which may be NULL).
            *
            * The key is appended to zsd_registered_keys, a zsd_entry for it is added
            * to every zone on zone_active that is neither uninitialized nor already
            * down, the create callback (if one was given) is run through
            * zsd_apply_all_zones(), and only then is the new key value stored
            * through keyp.
            */
 607  607  void
 608  608  zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 609  609      void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 610  610  {
 611  611          struct zsd_entry *zsdp;
 612  612          struct zsd_entry *t;
 613  613          struct zone *zone;
 614  614          zone_key_t  key;
 615  615  
 616  616          zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 617  617          zsdp->zsd_data = NULL;
 618  618          zsdp->zsd_create = create;
 619  619          zsdp->zsd_shutdown = shutdown;
 620  620          zsdp->zsd_destroy = destroy;
 621  621  
 622  622          /*
 623  623           * Insert in global list of callbacks. Makes future zone creations
 624  624           * see it.
                    *
                    * Key values are produced by pre-incrementing zsd_keyval while
                    * zsd_key_lock is held; the ASSERT trips if the counter ever
                    * comes back around to 0.
 625  625           */
 626  626          mutex_enter(&zsd_key_lock);
 627  627          key = zsdp->zsd_key = ++zsd_keyval;
 628  628          ASSERT(zsd_keyval != 0);
 629  629          list_insert_tail(&zsd_registered_keys, zsdp);
 630  630          mutex_exit(&zsd_key_lock);
 631  631  
 632  632          /*
 633  633           * Insert for all existing zones and mark them as needing
 634  634           * a create callback.
 635  635           */
 636  636          mutex_enter(&zonehash_lock);    /* stop the world */
 637  637          for (zone = list_head(&zone_active); zone != NULL;
 638  638              zone = list_next(&zone_active, zone)) {
 639  639                  zone_status_t status;
 640  640  
 641  641                  mutex_enter(&zone->zone_lock);
 642  642  
 643  643                  /* Skip zones that are on the way down or not yet up */
 644  644                  status = zone_status_get(zone);
 645  645                  if (status >= ZONE_IS_DOWN ||
 646  646                      status == ZONE_IS_UNINITIALIZED) {
 647  647                          mutex_exit(&zone->zone_lock);
 648  648                          continue;
 649  649                  }
 650  650  
 651  651                  t = zsd_find_mru(&zone->zone_zsd, key);
 652  652                  if (t != NULL) {
 653  653                          /*
 654  654                           * A zsd_configure already inserted it after
 655  655                           * we dropped zsd_key_lock above.
 656  656                           */
 657  657                          mutex_exit(&zone->zone_lock);
 658  658                          continue;
 659  659                  }
                           /*
                            * Give this zone its own entry carrying a copy of the
                            * callbacks. The entry is zero-filled, so zsd_data starts
                            * out NULL and zsd_flags starts out 0.
                            */
 660  660                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 661  661                  t->zsd_key = key;
 662  662                  t->zsd_create = create;
 663  663                  t->zsd_shutdown = shutdown;
 664  664                  t->zsd_destroy = destroy;
 665  665                  if (create != NULL) {
 666  666                          t->zsd_flags = ZSD_CREATE_NEEDED;
 667  667                          DTRACE_PROBE2(zsd__create__needed,
 668  668                              zone_t *, zone, zone_key_t, key);
 669  669                  }
 670  670                  list_insert_tail(&zone->zone_zsd, t);
 671  671                  mutex_exit(&zone->zone_lock);
 672  672          }
 673  673          mutex_exit(&zonehash_lock);
 674  674  
 675  675          if (create != NULL) {
 676  676                  /* Now call the create callback for this key */
 677  677                  zsd_apply_all_zones(zsd_apply_create, key);
 678  678          }
 679  679          /*
 680  680           * It is safe for consumers to use the key now, make it
 681  681           * globally visible. Specifically zone_getspecific() will
 682  682           * always successfully return the zone specific data associated
 683  683           * with the key.
 684  684           */
 685  685          *keyp = key;
 686  686  
 687  687  }
 688  688  
 689  689  /*
 690  690   * Function called when a module is being unloaded, or otherwise wishes
 691  691   * to unregister its ZSD key and callbacks.
 692  692   *
 693  693   * Remove from the global list and determine the functions that need to
 694  694   * be called under a global lock. Then call the functions without
 695  695   * holding any locks. Finally free up the zone_zsd entries. (The apply
 696  696   * functions need to access the zone_zsd entries to find zsd_data etc.)
            *
            * Returns 0 on success, or -1 if key is not on zsd_registered_keys.
 697  697   */
 698  698  int
 699  699  zone_key_delete(zone_key_t key)
 700  700  {
 701  701          struct zsd_entry *zsdp = NULL;
 702  702          zone_t *zone;
 703  703  
 704  704          mutex_enter(&zsd_key_lock);
 705  705          zsdp = zsd_find_mru(&zsd_registered_keys, key);
 706  706          if (zsdp == NULL) {
 707  707                  mutex_exit(&zsd_key_lock);
 708  708                  return (-1);
 709  709          }
 710  710          list_remove(&zsd_registered_keys, zsdp);
 711  711          mutex_exit(&zsd_key_lock);
 712  712  
                   /*
                    * First pass: flag each zone's entry for this key with the
                    * shutdown/destroy work that is still outstanding.
                    */
 713  713          mutex_enter(&zonehash_lock);
 714  714          for (zone = list_head(&zone_active); zone != NULL;
 715  715              zone = list_next(&zone_active, zone)) {
 716  716                  struct zsd_entry *del;
 717  717  
 718  718                  mutex_enter(&zone->zone_lock);
 719  719                  del = zsd_find_mru(&zone->zone_zsd, key);
 720  720                  if (del == NULL) {
 721  721                          /*
 722  722                           * Somebody else got here first e.g the zone going
 723  723                           * away.
 724  724                           */
 725  725                          mutex_exit(&zone->zone_lock);
 726  726                          continue;
 727  727                  }
 728  728                  ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 729  729                  ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 730  730                  if (del->zsd_shutdown != NULL &&
 731  731                      (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 732  732                          del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 733  733                          DTRACE_PROBE2(zsd__shutdown__needed,
 734  734                              zone_t *, zone, zone_key_t, key);
 735  735                  }
 736  736                  if (del->zsd_destroy != NULL &&
 737  737                      (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 738  738                          del->zsd_flags |= ZSD_DESTROY_NEEDED;
 739  739                          DTRACE_PROBE2(zsd__destroy__needed,
 740  740                              zone_t *, zone, zone_key_t, key);
 741  741                  }
 742  742                  mutex_exit(&zone->zone_lock);
 743  743          }
 744  744          mutex_exit(&zonehash_lock);
                   /*
                    * The global template entry was unlinked from
                    * zsd_registered_keys above and is not referenced again below,
                    * so it can be freed here.
                    */
 745  745          kmem_free(zsdp, sizeof (*zsdp));
 746  746  
 747  747          /* Now call the shutdown and destroy callback for this key */
 748  748          zsd_apply_all_zones(zsd_apply_shutdown, key);
 749  749          zsd_apply_all_zones(zsd_apply_destroy, key);
 750  750  
 751  751          /* Now we can free up the zsdp structures in each zone */
 752  752          mutex_enter(&zonehash_lock);
 753  753          for (zone = list_head(&zone_active); zone != NULL;
 754  754              zone = list_next(&zone_active, zone)) {
 755  755                  struct zsd_entry *del;
 756  756  
 757  757                  mutex_enter(&zone->zone_lock);
 758  758                  del = zsd_find(&zone->zone_zsd, key);
 759  759                  if (del != NULL) {
 760  760                          list_remove(&zone->zone_zsd, del);
 761  761                          ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 762  762                          kmem_free(del, sizeof (*del));
 763  763                  }
 764  764                  mutex_exit(&zone->zone_lock);
 765  765          }
 766  766          mutex_exit(&zonehash_lock);
 767  767  
 768  768          return (0);
 769  769  }
 770  770  
 771  771  /*
 772  772   * ZSD counterpart of pthread_setspecific().
 773  773   *
 774  774   * Since all zsd callbacks, including those with no create function,
 775  775   * have an entry in zone_zsd, if the key is registered it is part of
 776  776   * the zone_zsd list.
 777  777   * Return an error if the key wasn't registered.
            *
            * Returns 0 after storing data in the zone's entry for key, or -1 if
            * the zone's zone_zsd list has no entry for key.
 778  778   */
 779  779  int
 780  780  zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 781  781  {
 782  782          struct zsd_entry *t;
 783  783  
 784  784          mutex_enter(&zone->zone_lock);
 785  785          t = zsd_find_mru(&zone->zone_zsd, key);
 786  786          if (t != NULL) {
 787  787                  /*
 788  788                   * Replace old value with new
                            * (the cast drops the const qualifier so the pointer
                            * can be stored in zsd_data; the old value is simply
                            * overwritten, not freed here).
 789  789                   */
 790  790                  t->zsd_data = (void *)data;
 791  791                  mutex_exit(&zone->zone_lock);
 792  792                  return (0);
 793  793          }
 794  794          mutex_exit(&zone->zone_lock);
 795  795          return (-1);
 796  796  }
 797  797  
 798  798  /*
 799  799   * ZSD counterpart of pthread_getspecific().
            *
            * Looks up key in the zone's zone_zsd list under zone_lock and returns
            * the zsd_data stored there. NULL is returned both when the zone has no
            * entry for key and when the entry's zsd_data is itself NULL, so callers
            * cannot tell those two cases apart from the return value.
 800  800   */
 801  801  void *
 802  802  zone_getspecific(zone_key_t key, zone_t *zone)
 803  803  {
 804  804          struct zsd_entry *t;
 805  805          void *data;
 806  806  
 807  807          mutex_enter(&zone->zone_lock);
 808  808          t = zsd_find_mru(&zone->zone_zsd, key);
 809  809          data = (t == NULL ? NULL : t->zsd_data);
 810  810          mutex_exit(&zone->zone_lock);
 811  811          return (data);
 812  812  }
 813  813  
 814  814  /*
 815  815   * Function used to initialize a zone's list of ZSD callbacks and data
 816  816   * when the zone is being created.  The callbacks are initialized from
 817  817   * the template list (zsd_registered_keys). The constructor callback is
 818  818   * executed later (once the zone exists and with locks dropped).
            *
            * The caller must hold zonehash_lock and the zone's zone_zsd list must
            * be empty (both are ASSERTed). zone_lock and then zsd_key_lock are
            * taken here for the duration of the copy.
 819  819   */
 820  820  static void
 821  821  zone_zsd_configure(zone_t *zone)
 822  822  {
 823  823          struct zsd_entry *zsdp;
 824  824          struct zsd_entry *t;
 825  825  
 826  826          ASSERT(MUTEX_HELD(&zonehash_lock));
 827  827          ASSERT(list_head(&zone->zone_zsd) == NULL);
 828  828          mutex_enter(&zone->zone_lock);
 829  829          mutex_enter(&zsd_key_lock);
 830  830          for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 831  831              zsdp = list_next(&zsd_registered_keys, zsdp)) {
 832  832                  /*
 833  833                   * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 834  834                   * should not have added anything to it.
                            *
                            * NOTE(review): zone_key_create() skips zones whose
                            * status is ZONE_IS_UNINITIALIZED; confirm which status
                            * name is meant here.
 835  835                   */
 836  836                  ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 837  837  
                           /* Per-zone copy of the template; zsd_data starts NULL. */
 838  838                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 839  839                  t->zsd_key = zsdp->zsd_key;
 840  840                  t->zsd_create = zsdp->zsd_create;
 841  841                  t->zsd_shutdown = zsdp->zsd_shutdown;
 842  842                  t->zsd_destroy = zsdp->zsd_destroy;
 843  843                  if (zsdp->zsd_create != NULL) {
 844  844                          t->zsd_flags = ZSD_CREATE_NEEDED;
 845  845                          DTRACE_PROBE2(zsd__create__needed,
 846  846                              zone_t *, zone, zone_key_t, zsdp->zsd_key);
 847  847                  }
 848  848                  list_insert_tail(&zone->zone_zsd, t);
 849  849          }
 850  850          mutex_exit(&zsd_key_lock);
 851  851          mutex_exit(&zone->zone_lock);
 852  852  }
 853  853  
           /*
            * Kind of ZSD callback being requested. zone_zsd_callbacks() accepts
            * only ZSD_SHUTDOWN and ZSD_DESTROY (it ASSERTs this).
            */
 854  854  enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 855  855  
 856  856  /*
 857  857   * Helper function to execute shutdown or destructor callbacks.
            *
            * ct selects which kind of callback is flagged as needed on each of the
            * zone's zsd entries (ZSD_SHUTDOWN or ZSD_DESTROY; anything else trips
            * an ASSERT). After flagging, both the shutdown and the destroy apply
            * passes are run over the zone's keys regardless of ct; each pass only
            * acts on entries that carry its own *_NEEDED flag.
 858  858   */
 859  859  static void
 860  860  zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 861  861  {
 862  862          struct zsd_entry *t;
 863  863  
 864  864          ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 865  865          ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 866  866          ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 867  867  
 868  868          /*
 869  869           * Run the callback solely based on what is registered for the zone
 870  870           * in zone_zsd. The global list can change independently of this
 871  871           * as keys are registered and unregistered and we don't register new
 872  872           * callbacks for a zone that is in the process of going away.
 873  873           */
 874  874          mutex_enter(&zone->zone_lock);
 875  875          for (t = list_head(&zone->zone_zsd); t != NULL;
 876  876              t = list_next(&zone->zone_zsd, t)) {
 877  877                  zone_key_t key = t->zsd_key;
 878  878  
 879  879                  /* Flag only entries with a callback and no *_ALL bit set */
 880  880  
 881  881                  if (ct == ZSD_SHUTDOWN) {
 882  882                          if (t->zsd_shutdown != NULL &&
 883  883                              (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 884  884                                  t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 885  885                                  DTRACE_PROBE2(zsd__shutdown__needed,
 886  886                                      zone_t *, zone, zone_key_t, key);
 887  887                          }
 888  888                  } else {
 889  889                          if (t->zsd_destroy != NULL &&
 890  890                              (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 891  891                                  t->zsd_flags |= ZSD_DESTROY_NEEDED;
 892  892                                  DTRACE_PROBE2(zsd__destroy__needed,
 893  893                                      zone_t *, zone, zone_key_t, key);
 894  894                          }
 895  895                  }
 896  896          }
 897  897          mutex_exit(&zone->zone_lock);
 898  898  
 899  899          /* Now call the shutdown and destroy callbacks for every key of this zone */
 900  900          zsd_apply_all_keys(zsd_apply_shutdown, zone);
 901  901          zsd_apply_all_keys(zsd_apply_destroy, zone);
 902  902  
 903  903  }
 904  904  
 905  905  /*
 906  906   * Called when the zone is going away; free ZSD-related memory, and
 907  907   * destroy the zone_zsd list.
            *
            * No callbacks are invoked here; every entry is expected to have no
            * *_INPROGRESS flag set (ASSERTed per entry).
 908  908   */
 909  909  static void
 910  910  zone_free_zsd(zone_t *zone)
 911  911  {
 912  912          struct zsd_entry *t, *next;
 913  913  
 914  914          /*
 915  915           * Free all the zsd_entry's we had on this zone.
 916  916           */
 917  917          mutex_enter(&zone->zone_lock);
 918  918          for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
                           /* Fetch the successor before t is unlinked and freed. */
 919  919                  next = list_next(&zone->zone_zsd, t);
 920  920                  list_remove(&zone->zone_zsd, t);
 921  921                  ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 922  922                  kmem_free(t, sizeof (*t));
 923  923          }
 924  924          list_destroy(&zone->zone_zsd);
 925  925          mutex_exit(&zone->zone_lock);
 926  926  
 927  927  }
 928  928  
 929  929  /*
 930  930   * Apply a function to all zones for particular key value.
 931  931   *
 932  932   * The applyfn has to drop zonehash_lock if it does some work, and
 933  933   * then reacquire it before it returns.
 934  934   * When the lock is dropped we don't follow list_next even
 935  935   * if it is possible to do so without any hazards. This is
 936  936   * because we want the design to allow for the list of zones
 937  937   * to change in any arbitrary way during the time the
 938  938   * lock was dropped.
 939  939   *
 940  940   * It is safe to restart the loop at list_head since the applyfn
 941  941   * changes the zsd_flags as it does work, so a subsequent
 942  942   * pass through will have no effect in applyfn, hence the loop will terminate
 943  943   * in at worst O(N^2).
            *
            * applyfn is called with &zonehash_lock as its lock argument and
            * B_FALSE for zone_lock_held, i.e. it takes zone_lock itself. A true
            * return value from applyfn is treated as "the lock was dropped".
 944  944   */
 945  945  static void
 946  946  zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 947  947  {
 948  948          zone_t *zone;
 949  949  
 950  950          mutex_enter(&zonehash_lock);
 951  951          zone = list_head(&zone_active);
 952  952          while (zone != NULL) {
 953  953                  if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 954  954                          /* Lock dropped - restart at head */
 955  955                          zone = list_head(&zone_active);
 956  956                  } else {
 957  957                          zone = list_next(&zone_active, zone);
 958  958                  }
 959  959          }
 960  960          mutex_exit(&zonehash_lock);
 961  961  }
 962  962  
 963  963  /*
 964  964   * Apply a function to all keys for a particular zone.
 965  965   *
 966  966   * The applyfn has to drop zone_lock if it does some work, and
 967  967   * then reacquire it before it returns.
 968  968   * When the lock is dropped we don't follow list_next even
 969  969   * if it is possible to do so without any hazards. This is
 970  970   * because we want the design to allow for the list of zsd callbacks
 971  971   * to change in any arbitrary way during the time the
 972  972   * lock was dropped.
 973  973   *
 974  974   * It is safe to restart the loop at list_head since the applyfn
 975  975   * changes the zsd_flags as it does work, so a subsequent
 976  976   * pass through will have no effect in applyfn, hence the loop will terminate
 977  977   * in at worst O(N^2).
            *
            * Unlike zsd_apply_all_zones(), no global lock is involved here: the
            * zone's zone_lock is held across the walk, and applyfn is called with
            * a NULL lock argument and B_TRUE for zone_lock_held.
 978  978   */
 979  979  static void
 980  980  zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 981  981  {
 982  982          struct zsd_entry *t;
 983  983  
 984  984          mutex_enter(&zone->zone_lock);
 985  985          t = list_head(&zone->zone_zsd);
 986  986          while (t != NULL) {
 987  987                  if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 988  988                          /* Lock dropped - restart at head */
 989  989                          t = list_head(&zone->zone_zsd);
 990  990                  } else {
 991  991                          t = list_next(&zone->zone_zsd, t);
 992  992                  }
 993  993          }
 994  994          mutex_exit(&zone->zone_lock);
 995  995  }
 996  996  
 997  997  /*
 998  998   * Call the create function for the zone and key if CREATE_NEEDED
 999  999   * is set.
1000 1000   * If some other thread gets here first and sets CREATE_INPROGRESS, then
1001 1001   * we wait for that thread to complete so that we can ensure that
1002 1002   * all the callbacks are done when we've looped over all zones/keys.
1003 1003   *
1004 1004   * When we call the create function, we drop the global lock held by the
1005 1005   * caller, and return true to tell the caller it needs to re-evaluate the
1006 1006   * state.
1007 1007   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1008 1008   * remains held on exit.
            *
            * lockp is the caller's global lock, or NULL if the caller holds none.
            * Returns B_TRUE if locks were dropped (while waiting on another thread
            * or in order to run the callback), B_FALSE otherwise, including when
            * the zone has no entry for key.
1009 1009   */
1010 1010  static boolean_t
1011 1011  zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1012 1012      zone_t *zone, zone_key_t key)
1013 1013  {
1014 1014          void *result;
1015 1015          struct zsd_entry *t;
1016 1016          boolean_t dropped;
1017 1017  
1018 1018          if (lockp != NULL) {
1019 1019                  ASSERT(MUTEX_HELD(lockp));
1020 1020          }
1021 1021          if (zone_lock_held) {
1022 1022                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1023 1023          } else {
1024 1024                  mutex_enter(&zone->zone_lock);
1025 1025          }
1026 1026  
1027 1027          t = zsd_find(&zone->zone_zsd, key);
1028 1028          if (t == NULL) {
1029 1029                  /*
1030 1030                   * Somebody else got here first e.g the zone going
1031 1031                   * away.
1032 1032                   */
1033 1033                  if (!zone_lock_held)
1034 1034                          mutex_exit(&zone->zone_lock);
1035 1035                  return (B_FALSE);
1036 1036          }
1037 1037          dropped = B_FALSE;
1038 1038          if (zsd_wait_for_inprogress(zone, t, lockp))
1039 1039                  dropped = B_TRUE;
1040 1040  
1041 1041          if (t->zsd_flags & ZSD_CREATE_NEEDED) {
                           /*
                            * Claim the work by swapping NEEDED for INPROGRESS while
                            * zone_lock is still held, then run the callback with
                            * both zone_lock and the caller's lock released.
                            */
1042 1042                  t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1043 1043                  t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1044 1044                  DTRACE_PROBE2(zsd__create__inprogress,
1045 1045                      zone_t *, zone, zone_key_t, key);
1046 1046                  mutex_exit(&zone->zone_lock);
1047 1047                  if (lockp != NULL)
1048 1048                          mutex_exit(lockp);
1049 1049  
1050 1050                  dropped = B_TRUE;
1051 1051                  ASSERT(t->zsd_create != NULL);
1052 1052                  DTRACE_PROBE2(zsd__create__start,
1053 1053                      zone_t *, zone, zone_key_t, key);
1054 1054  
1055 1055                  result = (*t->zsd_create)(zone->zone_id);
1056 1056  
1057 1057                  DTRACE_PROBE2(zsd__create__end,
1058 1058                      zone_t *, zone, void *, result);
1059 1059  
1060 1060                  ASSERT(result != NULL);
                           /* Re-take the locks in order: caller's lock, then zone_lock. */
1061 1061                  if (lockp != NULL)
1062 1062                          mutex_enter(lockp);
1063 1063                  mutex_enter(&zone->zone_lock);
1064 1064                  t->zsd_data = result;
1065 1065                  t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1066 1066                  t->zsd_flags |= ZSD_CREATE_COMPLETED;
                           /* Wake any thread sleeping on this entry's zsd_cv. */
1067 1067                  cv_broadcast(&t->zsd_cv);
1068 1068                  DTRACE_PROBE2(zsd__create__completed,
1069 1069                      zone_t *, zone, zone_key_t, key);
1070 1070          }
1071 1071          if (!zone_lock_held)
1072 1072                  mutex_exit(&zone->zone_lock);
1073 1073          return (dropped);
1074 1074  }
1075 1075  
1076 1076  /*
1077 1077   * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1078 1078   * is set.
1079 1079   * If some other thread gets here first and sets *_INPROGRESS, then
1080 1080   * we wait for that thread to complete so that we can ensure that
1081 1081   * all the callbacks are done when we've looped over all zones/keys.
1082 1082   *
1083 1083   * When we call the shutdown function, we drop the global lock held by the
1084 1084   * caller, and return true to tell the caller it needs to re-evaluate the
1085 1085   * state.
1086 1086   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1087 1087   * remains held on exit.
            *
            * lockp is the caller's global lock, or NULL if the caller holds none.
            * Returns B_TRUE if locks were dropped (while waiting on another thread
            * or in order to run the callback), B_FALSE otherwise, including when
            * the zone has no entry for key.
1088 1088   */
1089 1089  static boolean_t
1090 1090  zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1091 1091      zone_t *zone, zone_key_t key)
1092 1092  {
1093 1093          struct zsd_entry *t;
1094 1094          void *data;
1095 1095          boolean_t dropped;
1096 1096  
1097 1097          if (lockp != NULL) {
1098 1098                  ASSERT(MUTEX_HELD(lockp));
1099 1099          }
1100 1100          if (zone_lock_held) {
1101 1101                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1102 1102          } else {
1103 1103                  mutex_enter(&zone->zone_lock);
1104 1104          }
1105 1105  
1106 1106          t = zsd_find(&zone->zone_zsd, key);
1107 1107          if (t == NULL) {
1108 1108                  /*
1109 1109                   * Somebody else got here first e.g the zone going
1110 1110                   * away.
1111 1111                   */
1112 1112                  if (!zone_lock_held)
1113 1113                          mutex_exit(&zone->zone_lock);
1114 1114                  return (B_FALSE);
1115 1115          }
1116 1116          dropped = B_FALSE;
                   /* A pending create must finish before shutdown may run. */
1117 1117          if (zsd_wait_for_creator(zone, t, lockp))
1118 1118                  dropped = B_TRUE;
1119 1119  
1120 1120          if (zsd_wait_for_inprogress(zone, t, lockp))
1121 1121                  dropped = B_TRUE;
1122 1122  
1123 1123          if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
                           /*
                            * Claim the work by swapping NEEDED for INPROGRESS while
                            * zone_lock is still held, then run the callback with
                            * both zone_lock and the caller's lock released.
                            */
1124 1124                  t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1125 1125                  t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1126 1126                  DTRACE_PROBE2(zsd__shutdown__inprogress,
1127 1127                      zone_t *, zone, zone_key_t, key);
1128 1128                  mutex_exit(&zone->zone_lock);
1129 1129                  if (lockp != NULL)
1130 1130                          mutex_exit(lockp);
1131 1131                  dropped = B_TRUE;
1132 1132  
1133 1133                  ASSERT(t->zsd_shutdown != NULL);
1134 1134                  data = t->zsd_data;
1135 1135  
1136 1136                  DTRACE_PROBE2(zsd__shutdown__start,
1137 1137                      zone_t *, zone, zone_key_t, key);
1138 1138  
1139 1139                  (t->zsd_shutdown)(zone->zone_id, data);
1140 1140                  DTRACE_PROBE2(zsd__shutdown__end,
1141 1141                      zone_t *, zone, zone_key_t, key);
1142 1142  
                           /* Re-take the locks in order: caller's lock, then zone_lock. */
1143 1143                  if (lockp != NULL)
1144 1144                          mutex_enter(lockp);
1145 1145                  mutex_enter(&zone->zone_lock);
1146 1146                  t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1147 1147                  t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1148 1148                  cv_broadcast(&t->zsd_cv);
1149 1149                  DTRACE_PROBE2(zsd__shutdown__completed,
1150 1150                      zone_t *, zone, zone_key_t, key);
1151 1151          }
1152 1152          if (!zone_lock_held)
1153 1153                  mutex_exit(&zone->zone_lock);
1154 1154          return (dropped);
1155 1155  }
1156 1156  
1157 1157  /*
1158 1158   * Call the destroy function for the zone and key if DESTROY_NEEDED
1159 1159   * is set.
1160 1160   * If some other thread gets here first and sets *_INPROGRESS, then
1161 1161   * we wait for that thread to complete so that we can ensure that
1162 1162   * all the callbacks are done when we've looped over all zones/keys.
1163 1163   *
1164 1164   * When we call the destroy function, we drop the global lock held by the
1165 1165   * caller, and return true to tell the caller it needs to re-evaluate the
1166 1166   * state.
1167 1167   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1168 1168   * remains held on exit.
            *
            * lockp is the caller's global lock, or NULL if the caller holds none.
            * Returns B_TRUE if locks were dropped (while waiting on another thread
            * or in order to run the callback), B_FALSE otherwise, including when
            * the zone has no entry for key.
1169 1169   */
1170 1170  static boolean_t
1171 1171  zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1172 1172      zone_t *zone, zone_key_t key)
1173 1173  {
1174 1174          struct zsd_entry *t;
1175 1175          void *data;
1176 1176          boolean_t dropped;
1177 1177  
1178 1178          if (lockp != NULL) {
1179 1179                  ASSERT(MUTEX_HELD(lockp));
1180 1180          }
1181 1181          if (zone_lock_held) {
1182 1182                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1183 1183          } else {
1184 1184                  mutex_enter(&zone->zone_lock);
1185 1185          }
1186 1186  
1187 1187          t = zsd_find(&zone->zone_zsd, key);
1188 1188          if (t == NULL) {
1189 1189                  /*
1190 1190                   * Somebody else got here first e.g the zone going
1191 1191                   * away.
1192 1192                   */
1193 1193                  if (!zone_lock_held)
1194 1194                          mutex_exit(&zone->zone_lock);
1195 1195                  return (B_FALSE);
1196 1196          }
1197 1197          dropped = B_FALSE;
                   /* A pending create must finish before destroy may run. */
1198 1198          if (zsd_wait_for_creator(zone, t, lockp))
1199 1199                  dropped = B_TRUE;
1200 1200  
1201 1201          if (zsd_wait_for_inprogress(zone, t, lockp))
1202 1202                  dropped = B_TRUE;
1203 1203  
1204 1204          if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
                           /*
                            * Claim the work by swapping NEEDED for INPROGRESS while
                            * zone_lock is still held, then run the callback with
                            * both zone_lock and the caller's lock released.
                            */
1205 1205                  t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1206 1206                  t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1207 1207                  DTRACE_PROBE2(zsd__destroy__inprogress,
1208 1208                      zone_t *, zone, zone_key_t, key);
1209 1209                  mutex_exit(&zone->zone_lock);
1210 1210                  if (lockp != NULL)
1211 1211                          mutex_exit(lockp);
1212 1212                  dropped = B_TRUE;
1213 1213  
1214 1214                  ASSERT(t->zsd_destroy != NULL);
1215 1215                  data = t->zsd_data;
1216 1216                  DTRACE_PROBE2(zsd__destroy__start,
1217 1217                      zone_t *, zone, zone_key_t, key);
1218 1218  
1219 1219                  (t->zsd_destroy)(zone->zone_id, data);
1220 1220                  DTRACE_PROBE2(zsd__destroy__end,
1221 1221                      zone_t *, zone, zone_key_t, key);
1222 1222  
                           /* Re-take the locks in order: caller's lock, then zone_lock. */
1223 1223                  if (lockp != NULL)
1224 1224                          mutex_enter(lockp);
1225 1225                  mutex_enter(&zone->zone_lock);
                           /* The data was handed to the destroy callback; forget it. */
1226 1226                  t->zsd_data = NULL;
1227 1227                  t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1228 1228                  t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1229 1229                  cv_broadcast(&t->zsd_cv);
1230 1230                  DTRACE_PROBE2(zsd__destroy__completed,
1231 1231                      zone_t *, zone, zone_key_t, key);
1232 1232          }
1233 1233          if (!zone_lock_held)
1234 1234                  mutex_exit(&zone->zone_lock);
1235 1235          return (dropped);
1236 1236  }
1237 1237  
1238 1238  /*
1239 1239   * Wait for any CREATE_NEEDED flag to be cleared.
1240 1240   * Returns true if lockp was temporarily dropped while waiting.
            *
            * zone->zone_lock is the mutex passed to cv_wait(), so it must be held
            * on entry. lockp may be NULL, meaning the caller holds no outer lock.
            *
            * NOTE(review): when lockp is NULL this returns B_FALSE even if it
            * slept, although cv_wait() releases zone_lock while sleeping; confirm
            * that callers passing NULL do not depend on the list being unchanged.
1241 1241   */
1242 1242  static boolean_t
1243 1243  zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1244 1244  {
1245 1245          boolean_t dropped = B_FALSE;
1246 1246  
1247 1247          while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1248 1248                  DTRACE_PROBE2(zsd__wait__for__creator,
1249 1249                      zone_t *, zone, struct zsd_entry *, t);
                           /* Do not sleep while holding the caller's outer lock. */
1250 1250                  if (lockp != NULL) {
1251 1251                          dropped = B_TRUE;
1252 1252                          mutex_exit(lockp);
1253 1253                  }
1254 1254                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1255 1255                  if (lockp != NULL) {
1256 1256                          /* First drop zone_lock to preserve order */
1257 1257                          mutex_exit(&zone->zone_lock);
1258 1258                          mutex_enter(lockp);
1259 1259                          mutex_enter(&zone->zone_lock);
1260 1260                  }
1261 1261          }
1262 1262          return (dropped);
1263 1263  }
1264 1264  
1265 1265  /*
1266 1266   * Wait for any INPROGRESS flag to be cleared.
1267 1267   * Returns true if lockp was temporarily dropped while waiting.
            *
            * zone->zone_lock is the mutex passed to cv_wait(), so it must be held
            * on entry. lockp may be NULL, meaning the caller holds no outer lock.
            *
            * NOTE(review): when lockp is NULL this returns B_FALSE even if it
            * slept, although cv_wait() releases zone_lock while sleeping; confirm
            * that callers passing NULL do not depend on the list being unchanged.
1268 1268   */
1269 1269  static boolean_t
1270 1270  zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1271 1271  {
1272 1272          boolean_t dropped = B_FALSE;
1273 1273  
1274 1274          while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1275 1275                  DTRACE_PROBE2(zsd__wait__for__inprogress,
1276 1276                      zone_t *, zone, struct zsd_entry *, t);
                           /* Do not sleep while holding the caller's outer lock. */
1277 1277                  if (lockp != NULL) {
1278 1278                          dropped = B_TRUE;
1279 1279                          mutex_exit(lockp);
1280 1280                  }
1281 1281                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1282 1282                  if (lockp != NULL) {
1283 1283                          /* First drop zone_lock to preserve order */
1284 1284                          mutex_exit(&zone->zone_lock);
1285 1285                          mutex_enter(lockp);
1286 1286                          mutex_enter(&zone->zone_lock);
1287 1287                  }
1288 1288          }
1289 1289          return (dropped);
1290 1290  }
1291 1291  
1292 1292  /*
1293 1293   * Frees memory associated with the zone dataset list.
            *
            * Every zone_dataset_t on zone_datasets is unlinked and freed together
            * with its zd_dataset string, then the list itself is destroyed. No
            * lock is taken here.
1294 1294   */
1295 1295  static void
1296 1296  zone_free_datasets(zone_t *zone)
1297 1297  {
1298 1298          zone_dataset_t *t, *next;
1299 1299  
1300 1300          for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
                           /* Fetch the successor before t is unlinked and freed. */
1301 1301                  next = list_next(&zone->zone_datasets, t);
1302 1302                  list_remove(&zone->zone_datasets, t);
                           /*
                            * Assumes zd_dataset was allocated as exactly
                            * strlen() + 1 bytes - TODO confirm at the allocation
                            * site.
                            */
1303 1303                  kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1304 1304                  kmem_free(t, sizeof (*t));
1305 1305          }
1306 1306          list_destroy(&zone->zone_datasets);
1307 1307  }
1308 1308  
1309 1309  /*
1310 1310   * zone.cpu-shares resource control support.
            *
            * Usage callback: returns zone_shares of the zone that process p
            * belongs to. The caller must hold p->p_lock (ASSERTed); the rctl
            * argument is unused.
1311 1311   */
1312 1312  /*ARGSUSED*/
1313 1313  static rctl_qty_t
1314 1314  zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1315 1315  {
1316 1316          ASSERT(MUTEX_HELD(&p->p_lock));
1317 1317          return (p->p_zone->zone_shares);
1318 1318  }
1319 1319  
1320 1320  /*ARGSUSED*/
           /*
            * Set callback for zone.cpu-shares: stores nv in zone_shares of the zone
            * named by the rctl entity e. The caller must hold p->p_lock and e must
            * be a zone entity (both ASSERTed). Always returns 0; a NULL zone in e
            * is silently ignored.
            */
1321 1321  static int
1322 1322  zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1323 1323      rctl_qty_t nv)
1324 1324  {
1325 1325          ASSERT(MUTEX_HELD(&p->p_lock));
1326 1326          ASSERT(e->rcep_t == RCENTITY_ZONE);
1327 1327          if (e->rcep_p.zone == NULL)
1328 1328                  return (0);
1329 1329  
1330 1330          e->rcep_p.zone->zone_shares = nv;
1331 1331          return (0);
1332 1332  }
1333 1333  
1334 1334  static rctl_ops_t zone_cpu_shares_ops = {
                   /*
                    * Callback table for zone.cpu-shares. Presumably the slots are
                    * action, usage, set and test in that order - confirm against
                    * the rctl_ops_t definition.
                    */
1335 1335          rcop_no_action,
1336 1336          zone_cpu_shares_usage,
1337 1337          zone_cpu_shares_set,
1338 1338          rcop_no_test
1339 1339  };
1340 1340  
1341 1341  /*
1342 1342   * zone.cpu-cap resource control support.
            *
            * Usage callback: returns whatever cpucaps_zone_get() reports for the
            * zone that process p belongs to. The caller must hold p->p_lock
            * (ASSERTed); the rctl argument is unused.
1343 1343   */
1344 1344  /*ARGSUSED*/
1345 1345  static rctl_qty_t
1346 1346  zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1347 1347  {
1348 1348          ASSERT(MUTEX_HELD(&p->p_lock));
1349 1349          return (cpucaps_zone_get(p->p_zone));
1350 1350  }
1351 1351  
1352 1352  /*ARGSUSED*/
1353 1353  static int
1354 1354  zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1355 1355      rctl_qty_t nv)
1356 1356  {
1357 1357          zone_t *zone = e->rcep_p.zone;
1358 1358  
1359 1359          ASSERT(MUTEX_HELD(&p->p_lock));
1360 1360          ASSERT(e->rcep_t == RCENTITY_ZONE);
1361 1361  
1362 1362          if (zone == NULL)
1363 1363                  return (0);
1364 1364  
1365 1365          /*
1366 1366           * set cap to the new value.
1367 1367           */
1368 1368          return (cpucaps_zone_set(zone, nv));
1369 1369  }
1370 1370  
1371 1371  static rctl_ops_t zone_cpu_cap_ops = {
1372 1372          rcop_no_action,
1373 1373          zone_cpu_cap_get,
1374 1374          zone_cpu_cap_set,
1375 1375          rcop_no_test
1376 1376  };
1377 1377  
1378 1378  /*ARGSUSED*/
1379 1379  static rctl_qty_t
1380 1380  zone_lwps_usage(rctl_t *r, proc_t *p)
1381 1381  {
1382 1382          rctl_qty_t nlwps;
1383 1383          zone_t *zone = p->p_zone;
1384 1384  
1385 1385          ASSERT(MUTEX_HELD(&p->p_lock));
1386 1386  
1387 1387          mutex_enter(&zone->zone_nlwps_lock);
1388 1388          nlwps = zone->zone_nlwps;
1389 1389          mutex_exit(&zone->zone_nlwps_lock);
1390 1390  
1391 1391          return (nlwps);
1392 1392  }
1393 1393  
1394 1394  /*ARGSUSED*/
1395 1395  static int
1396 1396  zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1397 1397      rctl_qty_t incr, uint_t flags)
1398 1398  {
1399 1399          rctl_qty_t nlwps;
1400 1400  
1401 1401          ASSERT(MUTEX_HELD(&p->p_lock));
1402 1402          ASSERT(e->rcep_t == RCENTITY_ZONE);
1403 1403          if (e->rcep_p.zone == NULL)
1404 1404                  return (0);
1405 1405          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1406 1406          nlwps = e->rcep_p.zone->zone_nlwps;
1407 1407  
1408 1408          if (nlwps + incr > rcntl->rcv_value)
1409 1409                  return (1);
1410 1410  
1411 1411          return (0);
1412 1412  }
1413 1413  
1414 1414  /*ARGSUSED*/
1415 1415  static int
1416 1416  zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1417 1417  {
1418 1418          ASSERT(MUTEX_HELD(&p->p_lock));
1419 1419          ASSERT(e->rcep_t == RCENTITY_ZONE);
1420 1420          if (e->rcep_p.zone == NULL)
1421 1421                  return (0);
1422 1422          e->rcep_p.zone->zone_nlwps_ctl = nv;
1423 1423          return (0);
1424 1424  }
1425 1425  
1426 1426  static rctl_ops_t zone_lwps_ops = {
1427 1427          rcop_no_action,
1428 1428          zone_lwps_usage,
1429 1429          zone_lwps_set,
1430 1430          zone_lwps_test,
1431 1431  };
1432 1432  
1433 1433  /*ARGSUSED*/
1434 1434  static rctl_qty_t
1435 1435  zone_procs_usage(rctl_t *r, proc_t *p)
1436 1436  {
1437 1437          rctl_qty_t nprocs;
1438 1438          zone_t *zone = p->p_zone;
1439 1439  
1440 1440          ASSERT(MUTEX_HELD(&p->p_lock));
1441 1441  
1442 1442          mutex_enter(&zone->zone_nlwps_lock);
1443 1443          nprocs = zone->zone_nprocs;
1444 1444          mutex_exit(&zone->zone_nlwps_lock);
1445 1445  
1446 1446          return (nprocs);
1447 1447  }
1448 1448  
1449 1449  /*ARGSUSED*/
1450 1450  static int
1451 1451  zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1452 1452      rctl_qty_t incr, uint_t flags)
1453 1453  {
1454 1454          rctl_qty_t nprocs;
1455 1455  
1456 1456          ASSERT(MUTEX_HELD(&p->p_lock));
1457 1457          ASSERT(e->rcep_t == RCENTITY_ZONE);
1458 1458          if (e->rcep_p.zone == NULL)
1459 1459                  return (0);
1460 1460          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1461 1461          nprocs = e->rcep_p.zone->zone_nprocs;
1462 1462  
1463 1463          if (nprocs + incr > rcntl->rcv_value)
1464 1464                  return (1);
1465 1465  
1466 1466          return (0);
1467 1467  }
1468 1468  
1469 1469  /*ARGSUSED*/
1470 1470  static int
1471 1471  zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1472 1472  {
1473 1473          ASSERT(MUTEX_HELD(&p->p_lock));
1474 1474          ASSERT(e->rcep_t == RCENTITY_ZONE);
1475 1475          if (e->rcep_p.zone == NULL)
1476 1476                  return (0);
1477 1477          e->rcep_p.zone->zone_nprocs_ctl = nv;
1478 1478          return (0);
1479 1479  }
1480 1480  
1481 1481  static rctl_ops_t zone_procs_ops = {
1482 1482          rcop_no_action,
1483 1483          zone_procs_usage,
1484 1484          zone_procs_set,
1485 1485          zone_procs_test,
1486 1486  };
1487 1487  
1488 1488  /*ARGSUSED*/
1489 1489  static int
1490 1490  zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1491 1491      rctl_qty_t incr, uint_t flags)
1492 1492  {
1493 1493          rctl_qty_t v;
1494 1494          ASSERT(MUTEX_HELD(&p->p_lock));
1495 1495          ASSERT(e->rcep_t == RCENTITY_ZONE);
1496 1496          v = e->rcep_p.zone->zone_shmmax + incr;
1497 1497          if (v > rval->rcv_value)
1498 1498                  return (1);
1499 1499          return (0);
1500 1500  }
1501 1501  
1502 1502  static rctl_ops_t zone_shmmax_ops = {
1503 1503          rcop_no_action,
1504 1504          rcop_no_usage,
1505 1505          rcop_no_set,
1506 1506          zone_shmmax_test
1507 1507  };
1508 1508  
1509 1509  /*ARGSUSED*/
1510 1510  static int
1511 1511  zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1512 1512      rctl_qty_t incr, uint_t flags)
1513 1513  {
1514 1514          rctl_qty_t v;
1515 1515          ASSERT(MUTEX_HELD(&p->p_lock));
1516 1516          ASSERT(e->rcep_t == RCENTITY_ZONE);
1517 1517          v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1518 1518          if (v > rval->rcv_value)
1519 1519                  return (1);
1520 1520          return (0);
1521 1521  }
1522 1522  
1523 1523  static rctl_ops_t zone_shmmni_ops = {
1524 1524          rcop_no_action,
1525 1525          rcop_no_usage,
1526 1526          rcop_no_set,
1527 1527          zone_shmmni_test
1528 1528  };
1529 1529  
1530 1530  /*ARGSUSED*/
1531 1531  static int
1532 1532  zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1533 1533      rctl_qty_t incr, uint_t flags)
1534 1534  {
1535 1535          rctl_qty_t v;
1536 1536          ASSERT(MUTEX_HELD(&p->p_lock));
1537 1537          ASSERT(e->rcep_t == RCENTITY_ZONE);
1538 1538          v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1539 1539          if (v > rval->rcv_value)
1540 1540                  return (1);
1541 1541          return (0);
1542 1542  }
1543 1543  
1544 1544  static rctl_ops_t zone_semmni_ops = {
1545 1545          rcop_no_action,
1546 1546          rcop_no_usage,
1547 1547          rcop_no_set,
1548 1548          zone_semmni_test
1549 1549  };
1550 1550  
1551 1551  /*ARGSUSED*/
1552 1552  static int
1553 1553  zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1554 1554      rctl_qty_t incr, uint_t flags)
1555 1555  {
1556 1556          rctl_qty_t v;
1557 1557          ASSERT(MUTEX_HELD(&p->p_lock));
1558 1558          ASSERT(e->rcep_t == RCENTITY_ZONE);
1559 1559          v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1560 1560          if (v > rval->rcv_value)
1561 1561                  return (1);
1562 1562          return (0);
1563 1563  }
1564 1564  
1565 1565  static rctl_ops_t zone_msgmni_ops = {
1566 1566          rcop_no_action,
1567 1567          rcop_no_usage,
1568 1568          rcop_no_set,
1569 1569          zone_msgmni_test
1570 1570  };
1571 1571  
1572 1572  /*ARGSUSED*/
1573 1573  static rctl_qty_t
1574 1574  zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1575 1575  {
1576 1576          rctl_qty_t q;
1577 1577          ASSERT(MUTEX_HELD(&p->p_lock));
1578 1578          mutex_enter(&p->p_zone->zone_mem_lock);
1579 1579          q = p->p_zone->zone_locked_mem;
1580 1580          mutex_exit(&p->p_zone->zone_mem_lock);
1581 1581          return (q);
1582 1582  }
1583 1583  
1584 1584  /*ARGSUSED*/
1585 1585  static int
1586 1586  zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1587 1587      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1588 1588  {
1589 1589          rctl_qty_t q;
1590 1590          zone_t *z;
1591 1591  
1592 1592          z = e->rcep_p.zone;
1593 1593          ASSERT(MUTEX_HELD(&p->p_lock));
1594 1594          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1595 1595          q = z->zone_locked_mem;
1596 1596          if (q + incr > rcntl->rcv_value)
1597 1597                  return (1);
1598 1598          return (0);
1599 1599  }
1600 1600  
1601 1601  /*ARGSUSED*/
1602 1602  static int
1603 1603  zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1604 1604      rctl_qty_t nv)
1605 1605  {
1606 1606          ASSERT(MUTEX_HELD(&p->p_lock));
1607 1607          ASSERT(e->rcep_t == RCENTITY_ZONE);
1608 1608          if (e->rcep_p.zone == NULL)
1609 1609                  return (0);
1610 1610          e->rcep_p.zone->zone_locked_mem_ctl = nv;
1611 1611          return (0);
1612 1612  }
1613 1613  
1614 1614  static rctl_ops_t zone_locked_mem_ops = {
1615 1615          rcop_no_action,
1616 1616          zone_locked_mem_usage,
1617 1617          zone_locked_mem_set,
1618 1618          zone_locked_mem_test
1619 1619  };
1620 1620  
1621 1621  /*ARGSUSED*/
1622 1622  static rctl_qty_t
1623 1623  zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1624 1624  {
1625 1625          rctl_qty_t q;
1626 1626          zone_t *z = p->p_zone;
1627 1627  
1628 1628          ASSERT(MUTEX_HELD(&p->p_lock));
1629 1629          mutex_enter(&z->zone_mem_lock);
1630 1630          q = z->zone_max_swap;
1631 1631          mutex_exit(&z->zone_mem_lock);
1632 1632          return (q);
1633 1633  }
1634 1634  
1635 1635  /*ARGSUSED*/
1636 1636  static int
1637 1637  zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1638 1638      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1639 1639  {
1640 1640          rctl_qty_t q;
1641 1641          zone_t *z;
1642 1642  
1643 1643          z = e->rcep_p.zone;
1644 1644          ASSERT(MUTEX_HELD(&p->p_lock));
1645 1645          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1646 1646          q = z->zone_max_swap;
1647 1647          if (q + incr > rcntl->rcv_value)
1648 1648                  return (1);
1649 1649          return (0);
1650 1650  }
1651 1651  
1652 1652  /*ARGSUSED*/
1653 1653  static int
1654 1654  zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1655 1655      rctl_qty_t nv)
1656 1656  {
1657 1657          ASSERT(MUTEX_HELD(&p->p_lock));
1658 1658          ASSERT(e->rcep_t == RCENTITY_ZONE);
1659 1659          if (e->rcep_p.zone == NULL)
1660 1660                  return (0);
1661 1661          e->rcep_p.zone->zone_max_swap_ctl = nv;
1662 1662          return (0);
1663 1663  }
1664 1664  
1665 1665  static rctl_ops_t zone_max_swap_ops = {
1666 1666          rcop_no_action,
1667 1667          zone_max_swap_usage,
1668 1668          zone_max_swap_set,
1669 1669          zone_max_swap_test
1670 1670  };
1671 1671  
1672 1672  /*ARGSUSED*/
1673 1673  static rctl_qty_t
1674 1674  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1675 1675  {
1676 1676          rctl_qty_t q;
1677 1677          zone_t *z = p->p_zone;
1678 1678  
1679 1679          ASSERT(MUTEX_HELD(&p->p_lock));
1680 1680          mutex_enter(&z->zone_rctl_lock);
1681 1681          q = z->zone_max_lofi;
1682 1682          mutex_exit(&z->zone_rctl_lock);
1683 1683          return (q);
1684 1684  }
1685 1685  
1686 1686  /*ARGSUSED*/
1687 1687  static int
1688 1688  zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1689 1689      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1690 1690  {
1691 1691          rctl_qty_t q;
1692 1692          zone_t *z;
1693 1693  
1694 1694          z = e->rcep_p.zone;
1695 1695          ASSERT(MUTEX_HELD(&p->p_lock));
1696 1696          ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1697 1697          q = z->zone_max_lofi;
1698 1698          if (q + incr > rcntl->rcv_value)
1699 1699                  return (1);
1700 1700          return (0);
1701 1701  }
1702 1702  
1703 1703  /*ARGSUSED*/
1704 1704  static int
1705 1705  zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1706 1706      rctl_qty_t nv)
1707 1707  {
1708 1708          ASSERT(MUTEX_HELD(&p->p_lock));
1709 1709          ASSERT(e->rcep_t == RCENTITY_ZONE);
1710 1710          if (e->rcep_p.zone == NULL)
1711 1711                  return (0);
1712 1712          e->rcep_p.zone->zone_max_lofi_ctl = nv;
1713 1713          return (0);
1714 1714  }
1715 1715  
1716 1716  static rctl_ops_t zone_max_lofi_ops = {
1717 1717          rcop_no_action,
1718 1718          zone_max_lofi_usage,
1719 1719          zone_max_lofi_set,
1720 1720          zone_max_lofi_test
1721 1721  };
1722 1722  
1723 1723  /*
1724 1724   * Helper function to brand the zone with a unique ID.
1725 1725   */
1726 1726  static void
1727 1727  zone_uniqid(zone_t *zone)
1728 1728  {
1729 1729          static uint64_t uniqid = 0;
1730 1730  
1731 1731          ASSERT(MUTEX_HELD(&zonehash_lock));
1732 1732          zone->zone_uniqid = uniqid++;
1733 1733  }
1734 1734  
1735 1735  /*
1736 1736   * Returns a held pointer to the "kcred" for the specified zone.
1737 1737   */
1738 1738  struct cred *
1739 1739  zone_get_kcred(zoneid_t zoneid)
1740 1740  {
1741 1741          zone_t *zone;
1742 1742          cred_t *cr;
1743 1743  
1744 1744          if ((zone = zone_find_by_id(zoneid)) == NULL)
1745 1745                  return (NULL);
1746 1746          cr = zone->zone_kcred;
1747 1747          crhold(cr);
1748 1748          zone_rele(zone);
1749 1749          return (cr);
1750 1750  }
1751 1751  
1752 1752  static int
1753 1753  zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1754 1754  {
1755 1755          zone_t *zone = ksp->ks_private;
1756 1756          zone_kstat_t *zk = ksp->ks_data;
1757 1757  
1758 1758          if (rw == KSTAT_WRITE)
1759 1759                  return (EACCES);
1760 1760  
1761 1761          zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1762 1762          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1763 1763          return (0);
1764 1764  }
1765 1765  
1766 1766  static int
1767 1767  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1768 1768  {
1769 1769          zone_t *zone = ksp->ks_private;
1770 1770          zone_kstat_t *zk = ksp->ks_data;
1771 1771  
1772 1772          if (rw == KSTAT_WRITE)
1773 1773                  return (EACCES);
1774 1774  
1775 1775          zk->zk_usage.value.ui64 = zone->zone_nprocs;
1776 1776          zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1777 1777          return (0);
1778 1778  }
1779 1779  
1780 1780  static int
1781 1781  zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1782 1782  {
1783 1783          zone_t *zone = ksp->ks_private;
1784 1784          zone_kstat_t *zk = ksp->ks_data;
1785 1785  
1786 1786          if (rw == KSTAT_WRITE)
1787 1787                  return (EACCES);
1788 1788  
1789 1789          zk->zk_usage.value.ui64 = zone->zone_max_swap;
1790 1790          zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1791 1791          return (0);
1792 1792  }
1793 1793  
1794 1794  static kstat_t *
1795 1795  zone_kstat_create_common(zone_t *zone, char *name,
1796 1796      int (*updatefunc) (kstat_t *, int))
1797 1797  {
1798 1798          kstat_t *ksp;
1799 1799          zone_kstat_t *zk;
1800 1800  
1801 1801          ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1802 1802              sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1803 1803              KSTAT_FLAG_VIRTUAL);
1804 1804  
1805 1805          if (ksp == NULL)
1806 1806                  return (NULL);
1807 1807  
1808 1808          zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1809 1809          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1810 1810          kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1811 1811          kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1812 1812          kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1813 1813          kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1814 1814          ksp->ks_update = updatefunc;
1815 1815          ksp->ks_private = zone;
1816 1816          kstat_install(ksp);
1817 1817          return (ksp);
1818 1818  }
1819 1819  
1820 1820  
1821 1821  static int
1822 1822  zone_mcap_kstat_update(kstat_t *ksp, int rw)
1823 1823  {
1824 1824          zone_t *zone = ksp->ks_private;
1825 1825          zone_mcap_kstat_t *zmp = ksp->ks_data;
1826 1826  
1827 1827          if (rw == KSTAT_WRITE)
1828 1828                  return (EACCES);
1829 1829  
1830 1830          zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1831 1831          zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1832 1832          zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1833 1833          zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1834 1834          zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1835 1835  
1836 1836          return (0);
1837 1837  }
1838 1838  
1839 1839  static kstat_t *
1840 1840  zone_mcap_kstat_create(zone_t *zone)
1841 1841  {
1842 1842          kstat_t *ksp;
1843 1843          zone_mcap_kstat_t *zmp;
1844 1844  
1845 1845          if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1846 1846              zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1847 1847              sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1848 1848              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1849 1849                  return (NULL);
1850 1850  
1851 1851          if (zone->zone_id != GLOBAL_ZONEID)
1852 1852                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1853 1853  
1854 1854          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1855 1855          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1856 1856          ksp->ks_lock = &zone->zone_mcap_lock;
1857 1857          zone->zone_mcap_stats = zmp;
1858 1858  
1859 1859          /* The kstat "name" field is not large enough for a full zonename */
1860 1860          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1861 1861          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1862 1862          kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1863 1863          kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1864 1864          kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1865 1865          kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1866 1866          kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1867 1867              KSTAT_DATA_UINT64);
1868 1868  
1869 1869          ksp->ks_update = zone_mcap_kstat_update;
1870 1870          ksp->ks_private = zone;
1871 1871  
1872 1872          kstat_install(ksp);
1873 1873          return (ksp);
1874 1874  }
1875 1875  
1876 1876  static int
1877 1877  zone_misc_kstat_update(kstat_t *ksp, int rw)
1878 1878  {
1879 1879          zone_t *zone = ksp->ks_private;
1880 1880          zone_misc_kstat_t *zmp = ksp->ks_data;
1881 1881          hrtime_t tmp;
1882 1882  
1883 1883          if (rw == KSTAT_WRITE)
1884 1884                  return (EACCES);
1885 1885  
1886 1886          tmp = zone->zone_utime;
1887 1887          scalehrtime(&tmp);
1888 1888          zmp->zm_utime.value.ui64 = tmp;
1889 1889          tmp = zone->zone_stime;
1890 1890          scalehrtime(&tmp);
1891 1891          zmp->zm_stime.value.ui64 = tmp;
1892 1892          tmp = zone->zone_wtime;
1893 1893          scalehrtime(&tmp);
1894 1894          zmp->zm_wtime.value.ui64 = tmp;
1895 1895  
1896 1896          zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1897 1897          zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1898 1898          zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1899 1899  
1900 1900          zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1901 1901          zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1902 1902          zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1903 1903          zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1904 1904  
1905 1905          zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1906 1906  
1907 1907          zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1908 1908          zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1909 1909  
1910 1910          return (0);
1911 1911  }
1912 1912  
1913 1913  static kstat_t *
1914 1914  zone_misc_kstat_create(zone_t *zone)
1915 1915  {
1916 1916          kstat_t *ksp;
1917 1917          zone_misc_kstat_t *zmp;
1918 1918  
1919 1919          if ((ksp = kstat_create_zone("zones", zone->zone_id,
1920 1920              zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1921 1921              sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1922 1922              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1923 1923                  return (NULL);
1924 1924  
1925 1925          if (zone->zone_id != GLOBAL_ZONEID)
1926 1926                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1927 1927  
1928 1928          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1929 1929          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1930 1930          ksp->ks_lock = &zone->zone_misc_lock;
1931 1931          zone->zone_misc_stats = zmp;
1932 1932  
1933 1933          /* The kstat "name" field is not large enough for a full zonename */
1934 1934          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1935 1935          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1936 1936          kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1937 1937          kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1938 1938          kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1939 1939          kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1940 1940          kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1941 1941          kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1942 1942              KSTAT_DATA_UINT32);
1943 1943          kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1944 1944          kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1945 1945              KSTAT_DATA_UINT32);
1946 1946          kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1947 1947          kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1948 1948          kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1949 1949              KSTAT_DATA_UINT32);
1950 1950          kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1951 1951          kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1952 1952  
1953 1953          ksp->ks_update = zone_misc_kstat_update;
1954 1954          ksp->ks_private = zone;
1955 1955  
1956 1956          kstat_install(ksp);
1957 1957          return (ksp);
1958 1958  }
1959 1959  
1960 1960  static void
1961 1961  zone_kstat_create(zone_t *zone)
1962 1962  {
1963 1963          zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1964 1964              "lockedmem", zone_lockedmem_kstat_update);
1965 1965          zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1966 1966              "swapresv", zone_swapresv_kstat_update);
1967 1967          zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
1968 1968              "nprocs", zone_nprocs_kstat_update);
1969 1969  
1970 1970          if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
1971 1971                  zone->zone_mcap_stats = kmem_zalloc(
1972 1972                      sizeof (zone_mcap_kstat_t), KM_SLEEP);
1973 1973          }
1974 1974  
1975 1975          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
1976 1976                  zone->zone_misc_stats = kmem_zalloc(
1977 1977                      sizeof (zone_misc_kstat_t), KM_SLEEP);
1978 1978          }
1979 1979  }
1980 1980  
1981 1981  static void
1982 1982  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
1983 1983  {
1984 1984          void *data;
1985 1985  
1986 1986          if (*pkstat != NULL) {
1987 1987                  data = (*pkstat)->ks_data;
1988 1988                  kstat_delete(*pkstat);
1989 1989                  kmem_free(data, datasz);
1990 1990                  *pkstat = NULL;
1991 1991          }
1992 1992  }
1993 1993  
1994 1994  static void
1995 1995  zone_kstat_delete(zone_t *zone)
1996 1996  {
1997 1997          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
1998 1998              sizeof (zone_kstat_t));
1999 1999          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2000 2000              sizeof (zone_kstat_t));
2001 2001          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2002 2002              sizeof (zone_kstat_t));
2003 2003          zone_kstat_delete_common(&zone->zone_mcap_ksp,
2004 2004              sizeof (zone_mcap_kstat_t));
2005 2005          zone_kstat_delete_common(&zone->zone_misc_ksp,
2006 2006              sizeof (zone_misc_kstat_t));
2007 2007  }
2008 2008  
2009 2009  /*
2010 2010   * Called very early on in boot to initialize the ZSD list so that
2011 2011   * zone_key_create() can be called before zone_init().  It also initializes
2012 2012   * portions of zone0 which may be used before zone_init() is called.  The
2013 2013   * variable "global_zone" will be set when zone0 is fully initialized by
2014 2014   * zone_init().
2015 2015   */
2016 2016  void
2017 2017  zone_zsd_init(void)
2018 2018  {
2019 2019          mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2020 2020          mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2021 2021          list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2022 2022              offsetof(struct zsd_entry, zsd_linkage));
2023 2023          list_create(&zone_active, sizeof (zone_t),
2024 2024              offsetof(zone_t, zone_linkage));
2025 2025          list_create(&zone_deathrow, sizeof (zone_t),
2026 2026              offsetof(zone_t, zone_linkage));
2027 2027  
2028 2028          mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2029 2029          mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2030 2030          mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2031 2031          zone0.zone_shares = 1;
2032 2032          zone0.zone_nlwps = 0;
2033 2033          zone0.zone_nlwps_ctl = INT_MAX;
2034 2034          zone0.zone_nprocs = 0;
2035 2035          zone0.zone_nprocs_ctl = INT_MAX;
2036 2036          zone0.zone_locked_mem = 0;
2037 2037          zone0.zone_locked_mem_ctl = UINT64_MAX;
2038 2038          ASSERT(zone0.zone_max_swap == 0);
2039 2039          zone0.zone_max_swap_ctl = UINT64_MAX;
2040 2040          zone0.zone_max_lofi = 0;

↓ open down ↓

2040 lines elided

↑ open up ↑

2041 2041          zone0.zone_max_lofi_ctl = UINT64_MAX;
2042 2042          zone0.zone_shmmax = 0;
2043 2043          zone0.zone_ipc.ipcq_shmmni = 0;
2044 2044          zone0.zone_ipc.ipcq_semmni = 0;
2045 2045          zone0.zone_ipc.ipcq_msgmni = 0;
2046 2046          zone0.zone_name = GLOBAL_ZONENAME;
2047 2047          zone0.zone_nodename = utsname.nodename;
2048 2048          zone0.zone_domain = srpc_domain;
2049 2049          zone0.zone_hostid = HW_INVALID_HOSTID;
2050 2050          zone0.zone_fs_allowed = NULL;
     2051 +        psecflags_default(&zone0.zone_secflags);
2051 2052          zone0.zone_ref = 1;
2052 2053          zone0.zone_id = GLOBAL_ZONEID;
2053 2054          zone0.zone_status = ZONE_IS_RUNNING;
2054 2055          zone0.zone_rootpath = "/";
2055 2056          zone0.zone_rootpathlen = 2;
2056 2057          zone0.zone_psetid = ZONE_PS_INVAL;
2057 2058          zone0.zone_ncpus = 0;
2058 2059          zone0.zone_ncpus_online = 0;
2059 2060          zone0.zone_proc_initpid = 1;
2060 2061          zone0.zone_initname = initname;

2061 2062          zone0.zone_lockedmem_kstat = NULL;
2062 2063          zone0.zone_swapresv_kstat = NULL;
2063 2064          zone0.zone_nprocs_kstat = NULL;
2064 2065  
2065 2066          zone0.zone_stime = 0;
2066 2067          zone0.zone_utime = 0;
2067 2068          zone0.zone_wtime = 0;
2068 2069  
2069 2070          list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2070 2071              offsetof(zone_ref_t, zref_linkage));
2071 2072          list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2072 2073              offsetof(struct zsd_entry, zsd_linkage));
2073 2074          list_insert_head(&zone_active, &zone0);
2074 2075  
2075 2076          /*
2076 2077           * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2077 2078           * to anything meaningful.  It is assigned to be 'rootdir' in
2078 2079           * vfs_mountroot().
2079 2080           */
2080 2081          zone0.zone_rootvp = NULL;
2081 2082          zone0.zone_vfslist = NULL;
2082 2083          zone0.zone_bootargs = initargs;
2083 2084          zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2084 2085          /*
2085 2086           * The global zone has all privileges
2086 2087           */
2087 2088          priv_fillset(zone0.zone_privset);
2088 2089          /*
2089 2090           * Add p0 to the global zone
2090 2091           */
2091 2092          zone0.zone_zsched = &p0;
2092 2093          p0.p_zone = &zone0;
2093 2094  }
2094 2095  
2095 2096  /*
2096 2097   * Compute a hash value based on the contents of the label and the DOI.  The
2097 2098   * hash algorithm is somewhat arbitrary, but is based on the observation that
2098 2099   * humans will likely pick labels that differ by amounts that work out to be
2099 2100   * multiples of the number of hash chains, and thus stirring in some primes
2100 2101   * should help.
2101 2102   */
2102 2103  static uint_t
2103 2104  hash_bylabel(void *hdata, mod_hash_key_t key)
2104 2105  {
2105 2106          const ts_label_t *lab = (ts_label_t *)key;
2106 2107          const uint32_t *up, *ue;
2107 2108          uint_t hash;
2108 2109          int i;
2109 2110  
2110 2111          _NOTE(ARGUNUSED(hdata));
2111 2112  
2112 2113          hash = lab->tsl_doi + (lab->tsl_doi << 1);
2113 2114          /* we depend on alignment of label, but not representation */
2114 2115          up = (const uint32_t *)&lab->tsl_label;
2115 2116          ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2116 2117          i = 1;
2117 2118          while (up < ue) {
2118 2119                  /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2119 2120                  hash += *up + (*up << ((i % 16) + 1));
2120 2121                  up++;
2121 2122                  i++;
2122 2123          }
2123 2124          return (hash);
2124 2125  }
2125 2126  
2126 2127  /*
2127 2128   * All that mod_hash cares about here is zero (equal) versus non-zero (not
2128 2129   * equal).  This may need to be changed if less than / greater than is ever
2129 2130   * needed.
2130 2131   */
2131 2132  static int
2132 2133  hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2133 2134  {
2134 2135          ts_label_t *lab1 = (ts_label_t *)key1;
2135 2136          ts_label_t *lab2 = (ts_label_t *)key2;
2136 2137  
2137 2138          return (label_equal(lab1, lab2) ? 0 : 1);
2138 2139  }
2139 2140  
2140 2141  /*
2141 2142   * Called by main() to initialize the zones framework.
2142 2143   */
2143 2144  void
2144 2145  zone_init(void)
2145 2146  {
2146 2147          rctl_dict_entry_t *rde;
2147 2148          rctl_val_t *dval;
2148 2149          rctl_set_t *set;
2149 2150          rctl_alloc_gp_t *gp;
2150 2151          rctl_entity_p_t e;
2151 2152          int res;
2152 2153  
2153 2154          ASSERT(curproc == &p0);
2154 2155  
2155 2156          /*
2156 2157           * Create ID space for zone IDs.  ID 0 is reserved for the
2157 2158           * global zone.
2158 2159           */
2159 2160          zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2160 2161  
2161 2162          /*
2162 2163           * Initialize generic zone resource controls, if any.
2163 2164           */
2164 2165          rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2165 2166              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2166 2167              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2167 2168              FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2168 2169  
2169 2170          rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2170 2171              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2171 2172              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2172 2173              RCTL_GLOBAL_INFINITE,
2173 2174              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2174 2175  
2175 2176          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2176 2177              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2177 2178              INT_MAX, INT_MAX, &zone_lwps_ops);
2178 2179  
2179 2180          rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2180 2181              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2181 2182              INT_MAX, INT_MAX, &zone_procs_ops);
2182 2183  
2183 2184          /*
2184 2185           * System V IPC resource controls
2185 2186           */
2186 2187          rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2187 2188              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2188 2189              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2189 2190  
2190 2191          rc_zone_semmni = rctl_register("zone.max-sem-ids",
2191 2192              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2192 2193              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2193 2194  
2194 2195          rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2195 2196              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2196 2197              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2197 2198  
2198 2199          rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2199 2200              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2200 2201              RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2201 2202  
2202 2203          /*
2203 2204           * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2204 2205           * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2205 2206           */
2206 2207          dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2207 2208          bzero(dval, sizeof (rctl_val_t));
2208 2209          dval->rcv_value = 1;
2209 2210          dval->rcv_privilege = RCPRIV_PRIVILEGED;
2210 2211          dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2211 2212          dval->rcv_action_recip_pid = -1;
2212 2213  
2213 2214          rde = rctl_dict_lookup("zone.cpu-shares");
2214 2215          (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2215 2216  
2216 2217          rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2217 2218              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2218 2219              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2219 2220              &zone_locked_mem_ops);
2220 2221  
2221 2222          rc_zone_max_swap = rctl_register("zone.max-swap",
2222 2223              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2223 2224              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2224 2225              &zone_max_swap_ops);
2225 2226  
2226 2227          rc_zone_max_lofi = rctl_register("zone.max-lofi",
2227 2228              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2228 2229              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2229 2230              &zone_max_lofi_ops);
2230 2231  
2231 2232          /*
2232 2233           * Initialize the ``global zone''.
2233 2234           */
2234 2235          set = rctl_set_create();
2235 2236          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
                   /*
                    * p0.p_lock is held while zone0's rctl set is initialized and its
                    * initial lwp, process and task counts are recorded.
                    */
2236 2237          mutex_enter(&p0.p_lock);
2237 2238          e.rcep_p.zone = &zone0;
2238 2239          e.rcep_t = RCENTITY_ZONE;
2239 2240          zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2240 2241              gp);
2241 2242  
2242 2243          zone0.zone_nlwps = p0.p_lwpcnt;
2243 2244          zone0.zone_nprocs = 1;
2244 2245          zone0.zone_ntasks = 1;
2245 2246          mutex_exit(&p0.p_lock);
2246 2247          zone0.zone_restart_init = B_TRUE;
2247 2248          zone0.zone_brand = &native_brand;
                   /* Done with the group obtained from rctl_set_init_prealloc(). */
2248 2249          rctl_prealloc_destroy(gp);
2249 2250          /*
2250 2251           * pool_default hasn't been initialized yet, so we let pool_init()
2251 2252           * take care of making sure the global zone is in the default pool.
2252 2253           */
2253 2254  
2254 2255          /*
2255 2256           * Initialize global zone kstats
2256 2257           */
2257 2258          zone_kstat_create(&zone0);
2258 2259  
2259 2260          /*
2260 2261           * Initialize zone label.
2261 2262           * mlp are initialized when tnzonecfg is loaded.
2262 2263           */
2263 2264          zone0.zone_slabel = l_admin_low;
2264 2265          rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2265 2266          label_hold(l_admin_low);
2266 2267  
2267 2268          /*
2268 2269           * Initialise the lock for the database structure used by mntfs.
2269 2270           */
2270 2271          rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2271 2272  
                   /*
                    * The lookup hashes are created and zone0 is inserted into them
                    * with zonehash_lock held.
                    */
2272 2273          mutex_enter(&zonehash_lock);
2273 2274          zone_uniqid(&zone0);
2274 2275          ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2275 2276  
2276 2277          zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2277 2278              mod_hash_null_valdtor);
2278 2279          zonehashbyname = mod_hash_create_strhash("zone_by_name",
2279 2280              zone_hash_size, mod_hash_null_valdtor);
2280 2281          /*
2281 2282           * maintain zonehashbylabel only for labeled systems
2282 2283           */
2283 2284          if (is_system_labeled())
2284 2285                  zonehashbylabel = mod_hash_create_extended("zone_by_label",
2285 2286                      zone_hash_size, mod_hash_null_keydtor,
2286 2287                      mod_hash_null_valdtor, hash_bylabel, NULL,
2287 2288                      hash_labelkey_cmp, KM_SLEEP);
                   /* zone0 is the only zone entered into the hashes below. */
2288 2289          zonecount = 1;
2289 2290  
2290 2291          (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2291 2292              (mod_hash_val_t)&zone0);
2292 2293          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2293 2294              (mod_hash_val_t)&zone0);
2294 2295          if (is_system_labeled()) {
2295 2296                  zone0.zone_flags |= ZF_HASHED_LABEL;
2296 2297                  (void) mod_hash_insert(zonehashbylabel,
2297 2298                      (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2298 2299          }
2299 2300          mutex_exit(&zonehash_lock);
2300 2301  
2301 2302          /*
2302 2303           * We avoid setting zone_kcred until now, since kcred is initialized
2303 2304           * sometime after zone_zsd_init() and before zone_init().
2304 2305           */
2305 2306          zone0.zone_kcred = kcred;
2306 2307          /*
2307 2308           * The global zone is fully initialized (except for zone_rootvp which
2308 2309           * will be set when the root filesystem is mounted).
2309 2310           */
2310 2311          global_zone = &zone0;
2311 2312  
2312 2313          /*
2313 2314           * Setup an event channel to send zone status change notifications on
2314 2315           */
2315 2316          res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2316 2317              EVCH_CREAT);
2317 2318  
                   /* A non-zero result from sysevent_evc_bind() is treated as fatal. */
2318 2319          if (res)
2319 2320                  panic("Sysevent_evc_bind failed during zone setup.\n");
2320 2321  
2321 2322  }
2322 2323  
2323 2324  static void
2324 2325  zone_free(zone_t *zone)
2325 2326  {
                   /*
                    * Preconditions: not the global zone; no tasks, lwps, processes,
                    * cred references or kcred remain; the zone is DEAD or
                    * UNINITIALIZED; and its tracked reference list is empty.
                    */
2326 2327          ASSERT(zone != global_zone);
2327 2328          ASSERT(zone->zone_ntasks == 0);
2328 2329          ASSERT(zone->zone_nlwps == 0);
2329 2330          ASSERT(zone->zone_nprocs == 0);
2330 2331          ASSERT(zone->zone_cred_ref == 0);
2331 2332          ASSERT(zone->zone_kcred == NULL);
2332 2333          ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2333 2334              zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2334 2335          ASSERT(list_is_empty(&zone->zone_ref_list));
2335 2336  
2336 2337          /*
2337 2338           * Remove any zone caps.
2338 2339           */
2339 2340          cpucaps_zone_remove(zone);
2340 2341  
2341 2342          ASSERT(zone->zone_cpucap == NULL);
2342 2343  
2343 2344          /* remove from deathrow list */
2344 2345          if (zone_status_get(zone) == ZONE_IS_DEAD) {
2345 2346                  ASSERT(zone->zone_ref == 0);
2346 2347                  mutex_enter(&zone_deathrow_lock);
2347 2348                  list_remove(&zone_deathrow, zone);
2348 2349                  mutex_exit(&zone_deathrow_lock);
2349 2350          }
2350 2351  
2351 2352          list_destroy(&zone->zone_ref_list);
2352 2353          zone_free_zsd(zone);
2353 2354          zone_free_datasets(zone);
2354 2355          list_destroy(&zone->zone_dl_list);
2355 2356  
                   /* Each optional member below is released only if it was set. */
2356 2357          if (zone->zone_rootvp != NULL)
2357 2358                  VN_RELE(zone->zone_rootvp);
2358 2359          if (zone->zone_rootpath)
2359 2360                  kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2360 2361          if (zone->zone_name != NULL)
2361 2362                  kmem_free(zone->zone_name, ZONENAME_MAX);
2362 2363          if (zone->zone_slabel != NULL)
2363 2364                  label_rele(zone->zone_slabel);
2364 2365          if (zone->zone_nodename != NULL)
2365 2366                  kmem_free(zone->zone_nodename, _SYS_NMLN);
2366 2367          if (zone->zone_domain != NULL)
2367 2368                  kmem_free(zone->zone_domain, _SYS_NMLN);
2368 2369          if (zone->zone_privset != NULL)
2369 2370                  kmem_free(zone->zone_privset, sizeof (priv_set_t));
2370 2371          if (zone->zone_rctls != NULL)
2371 2372                  rctl_set_free(zone->zone_rctls);
2372 2373          if (zone->zone_bootargs != NULL)
2373 2374                  strfree(zone->zone_bootargs);
2374 2375          if (zone->zone_initname != NULL)
2375 2376                  strfree(zone->zone_initname);
2376 2377          if (zone->zone_fs_allowed != NULL)
2377 2378                  strfree(zone->zone_fs_allowed);
2378 2379          if (zone->zone_pfexecd != NULL)
2379 2380                  klpd_freelist(&zone->zone_pfexecd);
                   /*
                    * Return the zone ID to zoneid_space, destroy the zone's locks and
                    * condition variable, and finally free the zone_t itself.
                    */
2380 2381          id_free(zoneid_space, zone->zone_id);
2381 2382          mutex_destroy(&zone->zone_lock);
2382 2383          cv_destroy(&zone->zone_cv);
2383 2384          rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2384 2385          rw_destroy(&zone->zone_mntfs_db_lock);
2385 2386          kmem_free(zone, sizeof (zone_t));
2386 2387  }
2387 2388  
2388 2389  /*
2389 2390   * See block comment at the top of this file for information about zone
2390 2391   * status values.
2391 2392   */
2392 2393  /*
2393 2394   * Convenience function for setting zone status.
2394 2395   */
2395 2396  static void
2396 2397  zone_status_set(zone_t *zone, zone_status_t status)
2397 2398  {
2398 2399  
2399 2400          nvlist_t *nvl = NULL;
2400 2401          ASSERT(MUTEX_HELD(&zone_status_lock));
2401 2402          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2402 2403              status >= zone_status_get(zone));
2403 2404  
2404 2405          if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2405 2406              nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2406 2407              nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2407 2408              zone_status_table[status]) ||
2408 2409              nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2409 2410              zone_status_table[zone->zone_status]) ||
2410 2411              nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2411 2412              nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2412 2413              sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2413 2414              ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2414 2415  #ifdef DEBUG
2415 2416                  (void) printf(
2416 2417                      "Failed to allocate and send zone state change event.\n");
2417 2418  #endif
2418 2419          }
2419 2420          nvlist_free(nvl);
2420 2421  
2421 2422          zone->zone_status = status;
2422 2423  
2423 2424          cv_broadcast(&zone->zone_cv);
2424 2425  }
2425 2426  
2426 2427  /*
2427 2428   * Public function to retrieve the zone status.  The zone status may
2428 2429   * change after it is retrieved.
2429 2430   */
2430 2431  zone_status_t
2431 2432  zone_status_get(zone_t *zone)
2432 2433  {
2433 2434          return (zone->zone_status);
2434 2435  }
2435 2436  
2436 2437  static int
2437 2438  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2438 2439  {
2439 2440          char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2440 2441          int err = 0;
2441 2442  
2442 2443          ASSERT(zone != global_zone);
2443 2444          if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2444 2445                  goto done;      /* EFAULT or ENAMETOOLONG */
2445 2446  
2446 2447          if (zone->zone_bootargs != NULL)
2447 2448                  strfree(zone->zone_bootargs);
2448 2449  
2449 2450          zone->zone_bootargs = strdup(buf);
2450 2451  
2451 2452  done:
2452 2453          kmem_free(buf, BOOTARGS_MAX);
2453 2454          return (err);
2454 2455  }
2455 2456  
2456 2457  static int
2457 2458  zone_set_brand(zone_t *zone, const char *brand)
2458 2459  {
2459 2460          struct brand_attr *attrp;
2460 2461          brand_t *bp;
2461 2462  
2462 2463          attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2463 2464          if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2464 2465                  kmem_free(attrp, sizeof (struct brand_attr));
2465 2466                  return (EFAULT);
2466 2467          }
2467 2468  
2468 2469          bp = brand_register_zone(attrp);
2469 2470          kmem_free(attrp, sizeof (struct brand_attr));
2470 2471          if (bp == NULL)
2471 2472                  return (EINVAL);
2472 2473  
2473 2474          /*
2474 2475           * This is the only place where a zone can change it's brand.
2475 2476           * We already need to hold zone_status_lock to check the zone
2476 2477           * status, so we'll just use that lock to serialize zone
2477 2478           * branding requests as well.
2478 2479           */
2479 2480          mutex_enter(&zone_status_lock);
2480 2481  
2481 2482          /* Re-Branding is not allowed and the zone can't be booted yet */
2482 2483          if ((ZONE_IS_BRANDED(zone)) ||
2483 2484              (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2484 2485                  mutex_exit(&zone_status_lock);
2485 2486                  brand_unregister_zone(bp);
2486 2487                  return (EINVAL);
2487 2488          }

↓ open down ↓

427 lines elided

↑ open up ↑

2488 2489  
2489 2490          /* set up the brand specific data */
2490 2491          zone->zone_brand = bp;
2491 2492          ZBROP(zone)->b_init_brand_data(zone);
2492 2493  
2493 2494          mutex_exit(&zone_status_lock);
2494 2495          return (0);
2495 2496  }
2496 2497  
2497 2498  static int
     2499 +zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
     2500 +{
     2501 +        int err = 0;
     2502 +        psecflags_t psf;
     2503 +
     2504 +        ASSERT(zone != global_zone);
     2505 +
     2506 +        if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
     2507 +                return (err);
     2508 +
     2509 +        if (zone_status_get(zone) > ZONE_IS_READY)
     2510 +                return (EINVAL);
     2511 +
     2512 +        if (!psecflags_validate(&psf))
     2513 +                return (EINVAL);
     2514 +
     2515 +        (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
     2516 +
     2517 +        /* Set security flags on the zone's zsched */
     2518 +        (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
     2519 +            sizeof (zone->zone_zsched->p_secflags));
     2520 +
     2521 +        return (0);
     2522 +}
     2523 +
     2524 +static int
2498 2525  zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2499 2526  {
2500 2527          char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2501 2528          int err = 0;
2502 2529  
2503 2530          ASSERT(zone != global_zone);
2504 2531          if ((err = copyinstr(zone_fs_allowed, buf,
2505 2532              ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2506 2533                  goto done;
2507 2534

2508 2535          if (zone->zone_fs_allowed != NULL)
2509 2536                  strfree(zone->zone_fs_allowed);
2510 2537  
2511 2538          zone->zone_fs_allowed = strdup(buf);
2512 2539  
2513 2540  done:
2514 2541          kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2515 2542          return (err);
2516 2543  }
2517 2544  
2518 2545  static int
2519 2546  zone_set_initname(zone_t *zone, const char *zone_initname)
2520 2547  {
2521 2548          char initname[INITNAME_SZ];
2522 2549          size_t len;
2523 2550          int err = 0;
2524 2551  
2525 2552          ASSERT(zone != global_zone);
2526 2553          if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2527 2554                  return (err);   /* EFAULT or ENAMETOOLONG */
2528 2555  
2529 2556          if (zone->zone_initname != NULL)
2530 2557                  strfree(zone->zone_initname);
2531 2558  
2532 2559          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2533 2560          (void) strcpy(zone->zone_initname, initname);
2534 2561          return (0);
2535 2562  }
2536 2563  
2537 2564  static int
2538 2565  zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2539 2566  {
2540 2567          uint64_t mcap;
2541 2568          int err = 0;
2542 2569  
2543 2570          if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2544 2571                  zone->zone_phys_mcap = mcap;
2545 2572  
2546 2573          return (err);
2547 2574  }
2548 2575  
2549 2576  static int
2550 2577  zone_set_sched_class(zone_t *zone, const char *new_class)
2551 2578  {
2552 2579          char sched_class[PC_CLNMSZ];
2553 2580          id_t classid;
2554 2581          int err;
2555 2582  
2556 2583          ASSERT(zone != global_zone);
2557 2584          if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2558 2585                  return (err);   /* EFAULT or ENAMETOOLONG */
2559 2586  
2560 2587          if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2561 2588                  return (set_errno(EINVAL));
2562 2589          zone->zone_defaultcid = classid;
2563 2590          ASSERT(zone->zone_defaultcid > 0 &&
2564 2591              zone->zone_defaultcid < loaded_classes);
2565 2592  
2566 2593          return (0);
2567 2594  }
2568 2595  
2569 2596  /*
2570 2597   * Block indefinitely waiting for (zone_status >= status)
2571 2598   */
2572 2599  void
2573 2600  zone_status_wait(zone_t *zone, zone_status_t status)
2574 2601  {
2575 2602          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2576 2603  
2577 2604          mutex_enter(&zone_status_lock);
2578 2605          while (zone->zone_status < status) {
2579 2606                  cv_wait(&zone->zone_cv, &zone_status_lock);
2580 2607          }
2581 2608          mutex_exit(&zone_status_lock);
2582 2609  }
2583 2610  
2584 2611  /*
2585 2612   * Private CPR-safe version of zone_status_wait().
2586 2613   */
2587 2614  static void
2588 2615  zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2589 2616  {
2590 2617          callb_cpr_t cprinfo;
2591 2618  
2592 2619          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2593 2620  
2594 2621          CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2595 2622              str);
2596 2623          mutex_enter(&zone_status_lock);
2597 2624          while (zone->zone_status < status) {
2598 2625                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
2599 2626                  cv_wait(&zone->zone_cv, &zone_status_lock);
2600 2627                  CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2601 2628          }
2602 2629          /*
2603 2630           * zone_status_lock is implicitly released by the following.
2604 2631           */
2605 2632          CALLB_CPR_EXIT(&cprinfo);
2606 2633  }
2607 2634  
2608 2635  /*
2609 2636   * Block until zone enters requested state or signal is received.  Return (0)
2610 2637   * if signaled, non-zero otherwise.
2611 2638   */
2612 2639  int
2613 2640  zone_status_wait_sig(zone_t *zone, zone_status_t status)
2614 2641  {
2615 2642          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2616 2643  
2617 2644          mutex_enter(&zone_status_lock);
2618 2645          while (zone->zone_status < status) {
2619 2646                  if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2620 2647                          mutex_exit(&zone_status_lock);
2621 2648                          return (0);
2622 2649                  }
2623 2650          }
2624 2651          mutex_exit(&zone_status_lock);
2625 2652          return (1);
2626 2653  }
2627 2654  
2628 2655  /*
2629 2656   * Block until the zone enters the requested state or the timeout expires,
2630 2657   * whichever happens first.  Return (-1) if operation timed out, time remaining
2631 2658   * otherwise.
2632 2659   */
2633 2660  clock_t
2634 2661  zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2635 2662  {
2636 2663          clock_t timeleft = 0;
2637 2664  
2638 2665          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2639 2666  
2640 2667          mutex_enter(&zone_status_lock);
2641 2668          while (zone->zone_status < status && timeleft != -1) {
2642 2669                  timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2643 2670          }
2644 2671          mutex_exit(&zone_status_lock);
2645 2672          return (timeleft);
2646 2673  }
2647 2674  
2648 2675  /*
2649 2676   * Block until the zone enters the requested state, the current process is
2650 2677   * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2651 2678   * operation timed out, 0 if signaled, time remaining otherwise.
2652 2679   */
2653 2680  clock_t
2654 2681  zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2655 2682  {
2656 2683          clock_t timeleft = tim - ddi_get_lbolt();
2657 2684  
2658 2685          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2659 2686  
2660 2687          mutex_enter(&zone_status_lock);
2661 2688          while (zone->zone_status < status) {
2662 2689                  timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2663 2690                      tim);
2664 2691                  if (timeleft <= 0)
2665 2692                          break;
2666 2693          }
2667 2694          mutex_exit(&zone_status_lock);
2668 2695          return (timeleft);
2669 2696  }
2670 2697  
2671 2698  /*
2672 2699   * Zones have two reference counts: one for references from credential
2673 2700   * structures (zone_cred_ref), and one (zone_ref) for everything else.
2674 2701   * This is so we can allow a zone to be rebooted while there are still
2675 2702   * outstanding cred references, since certain drivers cache dblks (which
2676 2703   * implicitly results in cached creds).  We wait for zone_ref to drop to
2677 2704   * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2678 2705   * later freed when the zone_cred_ref drops to 0, though nothing other
2679 2706   * than the zone id and privilege set should be accessed once the zone
2680 2707   * is "dead".
2681 2708   *
2682 2709   * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2683 2710   * to force halt/reboot to block waiting for the zone_cred_ref to drop
2684 2711   * to 0.  This can be useful to flush out other sources of cached creds
2685 2712   * that may be less innocuous than the driver case.
2686 2713   *
2687 2714   * Zones also provide a tracked reference counting mechanism in which zone
2688 2715   * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2689 2716   * debuggers determine the sources of leaked zone references.  See
2690 2717   * zone_hold_ref() and zone_rele_ref() below for more information.
2691 2718   */
2692 2719  
2693 2720  int zone_wait_for_cred = 0;  /* non-zero: ZONE_IS_UNREF() also requires zone_cred_ref == 0 */
2694 2721  
2695 2722  static void
2696 2723  zone_hold_locked(zone_t *z)
2697 2724  {
2698 2725          ASSERT(MUTEX_HELD(&z->zone_lock));
2699 2726          z->zone_ref++;
2700 2727          ASSERT(z->zone_ref != 0);
2701 2728  }
2702 2729  
2703 2730  /*
2704 2731   * Increment the specified zone's reference count.  The zone's zone_t structure
2705 2732   * will not be freed as long as the zone's reference count is nonzero.
2706 2733   * Decrement the zone's reference count via zone_rele().
2707 2734   *
2708 2735   * NOTE: This function should only be used to hold zones for short periods of
2709 2736   * time.  Use zone_hold_ref() if the zone must be held for a long time.
2710 2737   */
2711 2738  void
2712 2739  zone_hold(zone_t *z)
2713 2740  {
2714 2741          mutex_enter(&z->zone_lock);
2715 2742          zone_hold_locked(z);
2716 2743          mutex_exit(&z->zone_lock);
2717 2744  }
2718 2745  
2719 2746  /*
2720 2747   * If the non-cred ref count drops to 1 and either the cred ref count
2721 2748   * is 0 or we aren't waiting for cred references, the zone is ready to
2722 2749   * be destroyed.
2723 2750   */
#define ZONE_IS_UNREF(zone) \
        ((zone)->zone_ref == 1 && \
        (zone_wait_for_cred == 0 || (zone)->zone_cred_ref == 0))
2726 2753  
2727 2754  /*
2728 2755   * Common zone reference release function invoked by zone_rele() and
2729 2756   * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2730 2757   * zone's subsystem-specific reference counters are not affected by the
2731 2758   * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2732 2759   * removed from the specified zone's reference list.  ref must be non-NULL iff
2733 2760   * subsys is not ZONE_REF_NUM_SUBSYS.
2734 2761   */
2735 2762  static void
2736 2763  zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2737 2764  {
2738 2765          boolean_t wakeup;
2739 2766  
2740 2767          mutex_enter(&z->zone_lock);
2741 2768          ASSERT(z->zone_ref != 0);
2742 2769          z->zone_ref--;
2743 2770          if (subsys != ZONE_REF_NUM_SUBSYS) {
2744 2771                  ASSERT(z->zone_subsys_ref[subsys] != 0);
2745 2772                  z->zone_subsys_ref[subsys]--;
2746 2773                  list_remove(&z->zone_ref_list, ref);
2747 2774          }
2748 2775          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2749 2776                  /* no more refs, free the structure */
2750 2777                  mutex_exit(&z->zone_lock);
2751 2778                  zone_free(z);
2752 2779                  return;
2753 2780          }
2754 2781          /* signal zone_destroy so the zone can finish halting */
2755 2782          wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2756 2783          mutex_exit(&z->zone_lock);
2757 2784  
2758 2785          if (wakeup) {
2759 2786                  /*
2760 2787                   * Grabbing zonehash_lock here effectively synchronizes with
2761 2788                   * zone_destroy() to avoid missed signals.
2762 2789                   */
2763 2790                  mutex_enter(&zonehash_lock);
2764 2791                  cv_broadcast(&zone_destroy_cv);
2765 2792                  mutex_exit(&zonehash_lock);
2766 2793          }
2767 2794  }
2768 2795  
2769 2796  /*
2770 2797   * Decrement the specified zone's reference count.  The specified zone will
2771 2798   * cease to exist after this function returns if the reference count drops to
2772 2799   * zero.  This function should be paired with zone_hold().
2773 2800   */
2774 2801  void
2775 2802  zone_rele(zone_t *z)
2776 2803  {
2777 2804          zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2778 2805  }
2779 2806  
2780 2807  /*
2781 2808   * Initialize a zone reference structure.  This function must be invoked for
2782 2809   * a reference structure before the structure is passed to zone_hold_ref().
2783 2810   */
2784 2811  void
2785 2812  zone_init_ref(zone_ref_t *ref)
2786 2813  {
2787 2814          ref->zref_zone = NULL;
2788 2815          list_link_init(&ref->zref_linkage);
2789 2816  }
2790 2817  
2791 2818  /*
2792 2819   * Acquire a reference to zone z.  The caller must specify the
2793 2820   * zone_ref_subsys_t constant associated with its subsystem.  The specified
2794 2821   * zone_ref_t structure will represent a reference to the specified zone.  Use
2795 2822   * zone_rele_ref() to release the reference.
2796 2823   *
2797 2824   * The referenced zone_t structure will not be freed as long as the zone_t's
2798 2825   * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2799 2826   * references.
2800 2827   *
2801 2828   * NOTE: The zone_ref_t structure must be initialized before it is used.
2802 2829   * See zone_init_ref() above.
2803 2830   */
2804 2831  void
2805 2832  zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2806 2833  {
2807 2834          ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2808 2835  
2809 2836          /*
2810 2837           * Prevent consumers from reusing a reference structure before
2811 2838           * releasing it.
2812 2839           */
2813 2840          VERIFY(ref->zref_zone == NULL);
2814 2841  
2815 2842          ref->zref_zone = z;
2816 2843          mutex_enter(&z->zone_lock);
2817 2844          zone_hold_locked(z);
2818 2845          z->zone_subsys_ref[subsys]++;
2819 2846          ASSERT(z->zone_subsys_ref[subsys] != 0);
2820 2847          list_insert_head(&z->zone_ref_list, ref);
2821 2848          mutex_exit(&z->zone_lock);
2822 2849  }
2823 2850  
2824 2851  /*
2825 2852   * Release the zone reference represented by the specified zone_ref_t.
2826 2853   * The reference is invalid after it's released; however, the zone_ref_t
2827 2854   * structure can be reused without having to invoke zone_init_ref().
2828 2855   * subsys should be the same value that was passed to zone_hold_ref()
2829 2856   * when the reference was acquired.
2830 2857   */
2831 2858  void
2832 2859  zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2833 2860  {
2834 2861          zone_rele_common(ref->zref_zone, ref, subsys);
2835 2862  
2836 2863          /*
2837 2864           * Set the zone_ref_t's zref_zone field to NULL to generate panics
2838 2865           * when consumers dereference the reference.  This helps us catch
2839 2866           * consumers who use released references.  Furthermore, this lets
2840 2867           * consumers reuse the zone_ref_t structure without having to
2841 2868           * invoke zone_init_ref().
2842 2869           */
2843 2870          ref->zref_zone = NULL;
2844 2871  }
2845 2872  
2846 2873  void
2847 2874  zone_cred_hold(zone_t *z)
2848 2875  {
2849 2876          mutex_enter(&z->zone_lock);
2850 2877          z->zone_cred_ref++;
2851 2878          ASSERT(z->zone_cred_ref != 0);
2852 2879          mutex_exit(&z->zone_lock);
2853 2880  }
2854 2881  
2855 2882  void
2856 2883  zone_cred_rele(zone_t *z)
2857 2884  {
2858 2885          boolean_t wakeup;
2859 2886  
2860 2887          mutex_enter(&z->zone_lock);
2861 2888          ASSERT(z->zone_cred_ref != 0);
2862 2889          z->zone_cred_ref--;
2863 2890          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2864 2891                  /* no more refs, free the structure */
2865 2892                  mutex_exit(&z->zone_lock);
2866 2893                  zone_free(z);
2867 2894                  return;
2868 2895          }
2869 2896          /*
2870 2897           * If zone_destroy is waiting for the cred references to drain
2871 2898           * out, and they have, signal it.
2872 2899           */
2873 2900          wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2874 2901              zone_status_get(z) >= ZONE_IS_DEAD);
2875 2902          mutex_exit(&z->zone_lock);
2876 2903  
2877 2904          if (wakeup) {
2878 2905                  /*
2879 2906                   * Grabbing zonehash_lock here effectively synchronizes with
2880 2907                   * zone_destroy() to avoid missed signals.
2881 2908                   */
2882 2909                  mutex_enter(&zonehash_lock);
2883 2910                  cv_broadcast(&zone_destroy_cv);
2884 2911                  mutex_exit(&zonehash_lock);
2885 2912          }
2886 2913  }
2887 2914  
2888 2915  void
2889 2916  zone_task_hold(zone_t *z)
2890 2917  {
2891 2918          mutex_enter(&z->zone_lock);
2892 2919          z->zone_ntasks++;
2893 2920          ASSERT(z->zone_ntasks != 0);
2894 2921          mutex_exit(&z->zone_lock);
2895 2922  }
2896 2923  
2897 2924  void
2898 2925  zone_task_rele(zone_t *zone)
2899 2926  {
2900 2927          uint_t refcnt;
2901 2928  
2902 2929          mutex_enter(&zone->zone_lock);
2903 2930          ASSERT(zone->zone_ntasks != 0);
2904 2931          refcnt = --zone->zone_ntasks;
2905 2932          if (refcnt > 1) {       /* Common case */
2906 2933                  mutex_exit(&zone->zone_lock);
2907 2934                  return;
2908 2935          }
2909 2936          zone_hold_locked(zone); /* so we can use the zone_t later */
2910 2937          mutex_exit(&zone->zone_lock);
2911 2938          if (refcnt == 1) {
2912 2939                  /*
2913 2940                   * See if the zone is shutting down.
2914 2941                   */
2915 2942                  mutex_enter(&zone_status_lock);
2916 2943                  if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2917 2944                          goto out;
2918 2945                  }
2919 2946  
2920 2947                  /*
2921 2948                   * Make sure the ntasks didn't change since we
2922 2949                   * dropped zone_lock.
2923 2950                   */
2924 2951                  mutex_enter(&zone->zone_lock);
2925 2952                  if (refcnt != zone->zone_ntasks) {
2926 2953                          mutex_exit(&zone->zone_lock);
2927 2954                          goto out;
2928 2955                  }
2929 2956                  mutex_exit(&zone->zone_lock);
2930 2957  
2931 2958                  /*
2932 2959                   * No more user processes in the zone.  The zone is empty.
2933 2960                   */
2934 2961                  zone_status_set(zone, ZONE_IS_EMPTY);
2935 2962                  goto out;
2936 2963          }
2937 2964  
2938 2965          ASSERT(refcnt == 0);
2939 2966          /*
2940 2967           * zsched has exited; the zone is dead.
2941 2968           */
2942 2969          zone->zone_zsched = NULL;               /* paranoia */
2943 2970          mutex_enter(&zone_status_lock);
2944 2971          zone_status_set(zone, ZONE_IS_DEAD);
2945 2972  out:
2946 2973          mutex_exit(&zone_status_lock);
2947 2974          zone_rele(zone);
2948 2975  }
2949 2976  
2950 2977  zoneid_t
2951 2978  getzoneid(void)
2952 2979  {
2953 2980          return (curproc->p_zone->zone_id);
2954 2981  }
2955 2982  
2956 2983  /*
2957 2984   * Internal versions of zone_find_by_*().  These don't zone_hold() or
2958 2985   * check the validity of a zone's state.
2959 2986   */
2960 2987  static zone_t *
2961 2988  zone_find_all_by_id(zoneid_t zoneid)
2962 2989  {
2963 2990          mod_hash_val_t hv;
2964 2991          zone_t *zone = NULL;
2965 2992  
2966 2993          ASSERT(MUTEX_HELD(&zonehash_lock));
2967 2994  
2968 2995          if (mod_hash_find(zonehashbyid,
2969 2996              (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2970 2997                  zone = (zone_t *)hv;
2971 2998          return (zone);
2972 2999  }
2973 3000  
2974 3001  static zone_t *
2975 3002  zone_find_all_by_label(const ts_label_t *label)
2976 3003  {
2977 3004          mod_hash_val_t hv;
2978 3005          zone_t *zone = NULL;
2979 3006  
2980 3007          ASSERT(MUTEX_HELD(&zonehash_lock));
2981 3008  
2982 3009          /*
2983 3010           * zonehashbylabel is not maintained for unlabeled systems
2984 3011           */
2985 3012          if (!is_system_labeled())
2986 3013                  return (NULL);
2987 3014          if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2988 3015                  zone = (zone_t *)hv;
2989 3016          return (zone);
2990 3017  }
2991 3018  
2992 3019  static zone_t *
2993 3020  zone_find_all_by_name(char *name)
2994 3021  {
2995 3022          mod_hash_val_t hv;
2996 3023          zone_t *zone = NULL;
2997 3024  
2998 3025          ASSERT(MUTEX_HELD(&zonehash_lock));
2999 3026  
3000 3027          if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3001 3028                  zone = (zone_t *)hv;
3002 3029          return (zone);
3003 3030  }
3004 3031  
3005 3032  /*
3006 3033   * Public interface for looking up a zone by zoneid.  Only returns the zone if
3007 3034   * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3008 3035   * Caller must call zone_rele() once it is done with the zone.
3009 3036   *
3010 3037   * The zone may begin the zone_destroy() sequence immediately after this
3011 3038   * function returns, but may be safely used until zone_rele() is called.
3012 3039   */
3013 3040  zone_t *
3014 3041  zone_find_by_id(zoneid_t zoneid)
3015 3042  {
3016 3043          zone_t *zone;
3017 3044          zone_status_t status;
3018 3045  
3019 3046          mutex_enter(&zonehash_lock);
3020 3047          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3021 3048                  mutex_exit(&zonehash_lock);
3022 3049                  return (NULL);
3023 3050          }
3024 3051          status = zone_status_get(zone);
3025 3052          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3026 3053                  /*
3027 3054                   * For all practical purposes the zone doesn't exist.
3028 3055                   */
3029 3056                  mutex_exit(&zonehash_lock);
3030 3057                  return (NULL);
3031 3058          }
3032 3059          zone_hold(zone);
3033 3060          mutex_exit(&zonehash_lock);
3034 3061          return (zone);
3035 3062  }
3036 3063  
3037 3064  /*
3038 3065   * Similar to zone_find_by_id, but using zone label as the key.
3039 3066   */
3040 3067  zone_t *
3041 3068  zone_find_by_label(const ts_label_t *label)
3042 3069  {
3043 3070          zone_t *zone;
3044 3071          zone_status_t status;
3045 3072  
3046 3073          mutex_enter(&zonehash_lock);
3047 3074          if ((zone = zone_find_all_by_label(label)) == NULL) {
3048 3075                  mutex_exit(&zonehash_lock);
3049 3076                  return (NULL);
3050 3077          }
3051 3078  
3052 3079          status = zone_status_get(zone);
3053 3080          if (status > ZONE_IS_DOWN) {
3054 3081                  /*
3055 3082                   * For all practical purposes the zone doesn't exist.
3056 3083                   */
3057 3084                  mutex_exit(&zonehash_lock);
3058 3085                  return (NULL);
3059 3086          }
3060 3087          zone_hold(zone);
3061 3088          mutex_exit(&zonehash_lock);
3062 3089          return (zone);
3063 3090  }
3064 3091  
3065 3092  /*
3066 3093   * Similar to zone_find_by_id, but using zone name as the key.
3067 3094   */
3068 3095  zone_t *
3069 3096  zone_find_by_name(char *name)
3070 3097  {
3071 3098          zone_t *zone;
3072 3099          zone_status_t status;
3073 3100  
3074 3101          mutex_enter(&zonehash_lock);
3075 3102          if ((zone = zone_find_all_by_name(name)) == NULL) {
3076 3103                  mutex_exit(&zonehash_lock);
3077 3104                  return (NULL);
3078 3105          }
3079 3106          status = zone_status_get(zone);
3080 3107          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3081 3108                  /*
3082 3109                   * For all practical purposes the zone doesn't exist.
3083 3110                   */
3084 3111                  mutex_exit(&zonehash_lock);
3085 3112                  return (NULL);
3086 3113          }
3087 3114          zone_hold(zone);
3088 3115          mutex_exit(&zonehash_lock);
3089 3116          return (zone);
3090 3117  }
3091 3118  
3092 3119  /*
3093 3120   * Similar to zone_find_by_id(), using the path as a key.  For instance,
3094 3121   * if there is a zone "foo" rooted at /foo/root, and the path argument
3095 3122   * is "/foo/root/proc", it will return the held zone_t corresponding to
3096 3123   * zone "foo".
3097 3124   *
3098 3125   * zone_find_by_path() always returns a non-NULL value, since at the
3099 3126   * very least every path will be contained in the global zone.
3100 3127   *
3101 3128   * As with the other zone_find_by_*() functions, the caller is
3102 3129   * responsible for zone_rele()ing the return value of this function.
3103 3130   */
3104 3131  zone_t *
3105 3132  zone_find_by_path(const char *path)
3106 3133  {
3107 3134          zone_t *zone;
3108 3135          zone_t *zret = NULL;
3109 3136          zone_status_t status;
3110 3137  
3111 3138          if (path == NULL) {
3112 3139                  /*
3113 3140                   * Call from rootconf().
3114 3141                   */
3115 3142                  zone_hold(global_zone);
3116 3143                  return (global_zone);
3117 3144          }
3118 3145          ASSERT(*path == '/');
3119 3146          mutex_enter(&zonehash_lock);
3120 3147          for (zone = list_head(&zone_active); zone != NULL;
3121 3148              zone = list_next(&zone_active, zone)) {
3122 3149                  if (ZONE_PATH_VISIBLE(path, zone))
3123 3150                          zret = zone;
3124 3151          }
3125 3152          ASSERT(zret != NULL);
3126 3153          status = zone_status_get(zret);
3127 3154          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3128 3155                  /*
3129 3156                   * Zone practically doesn't exist.
3130 3157                   */
3131 3158                  zret = global_zone;
3132 3159          }
3133 3160          zone_hold(zret);
3134 3161          mutex_exit(&zonehash_lock);
3135 3162          return (zret);
3136 3163  }
3137 3164  
3138 3165  /*
3139 3166   * Public interface for updating per-zone load averages.  Called once per
3140 3167   * second.
3141 3168   *
3142 3169   * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3143 3170   */
3144 3171  void
3145 3172  zone_loadavg_update()
3146 3173  {
3147 3174          zone_t *zp;
3148 3175          zone_status_t status;
3149 3176          struct loadavg_s *lavg;
3150 3177          hrtime_t zone_total;
3151 3178          int i;
3152 3179          hrtime_t hr_avg;
3153 3180          int nrun;
3154 3181          static int64_t f[3] = { 135, 27, 9 };
3155 3182          int64_t q, r;
3156 3183  
3157 3184          mutex_enter(&zonehash_lock);
3158 3185          for (zp = list_head(&zone_active); zp != NULL;
3159 3186              zp = list_next(&zone_active, zp)) {
3160 3187                  mutex_enter(&zp->zone_lock);
3161 3188  
3162 3189                  /* Skip zones that are on the way down or not yet up */
3163 3190                  status = zone_status_get(zp);
3164 3191                  if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3165 3192                          /* For all practical purposes the zone doesn't exist. */
3166 3193                          mutex_exit(&zp->zone_lock);
3167 3194                          continue;
3168 3195                  }
3169 3196  
3170 3197                  /*
3171 3198                   * Update the 10 second moving average data in zone_loadavg.
3172 3199                   */
3173 3200                  lavg = &zp->zone_loadavg;
3174 3201  
3175 3202                  zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3176 3203                  scalehrtime(&zone_total);
3177 3204  
3178 3205                  /* The zone_total should always be increasing. */
3179 3206                  lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3180 3207                      zone_total - lavg->lg_total : 0;
3181 3208                  lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3182 3209                  /* lg_total holds the prev. 1 sec. total */
3183 3210                  lavg->lg_total = zone_total;
3184 3211  
3185 3212                  /*
3186 3213                   * To simplify the calculation, we don't calculate the load avg.
3187 3214                   * until the zone has been up for at least 10 seconds and our
3188 3215                   * moving average is thus full.
3189 3216                   */
3190 3217                  if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3191 3218                          lavg->lg_len++;
3192 3219                          mutex_exit(&zp->zone_lock);
3193 3220                          continue;
3194 3221                  }
3195 3222  
3196 3223                  /* Now calculate the 1min, 5min, 15 min load avg. */
3197 3224                  hr_avg = 0;
3198 3225                  for (i = 0; i < S_LOADAVG_SZ; i++)
3199 3226                          hr_avg += lavg->lg_loads[i];
3200 3227                  hr_avg = hr_avg / S_LOADAVG_SZ;
3201 3228                  nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3202 3229  
3203 3230                  /* Compute load avg. See comment in calcloadavg() */
3204 3231                  for (i = 0; i < 3; i++) {
3205 3232                          q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3206 3233                          r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3207 3234                          zp->zone_hp_avenrun[i] +=
3208 3235                              ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3209 3236  
3210 3237                          /* avenrun[] can only hold 31 bits of load avg. */
3211 3238                          if (zp->zone_hp_avenrun[i] <
3212 3239                              ((uint64_t)1<<(31+16-FSHIFT)))
3213 3240                                  zp->zone_avenrun[i] = (int32_t)
3214 3241                                      (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3215 3242                          else
3216 3243                                  zp->zone_avenrun[i] = 0x7fffffff;
3217 3244                  }
3218 3245  
3219 3246                  mutex_exit(&zp->zone_lock);
3220 3247          }
3221 3248          mutex_exit(&zonehash_lock);
3222 3249  }
3223 3250  
3224 3251  /*
3225 3252   * Get the number of cpus visible to this zone.  The system-wide global
3226 3253   * 'ncpus' is returned if pools are disabled, the caller is in the
3227 3254   * global zone, or a NULL zone argument is passed in.
3228 3255   */
3229 3256  int
3230 3257  zone_ncpus_get(zone_t *zone)
3231 3258  {
3232 3259          int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3233 3260  
3234 3261          return (myncpus != 0 ? myncpus : ncpus);
3235 3262  }
3236 3263  
3237 3264  /*
3238 3265   * Get the number of online cpus visible to this zone.  The system-wide
3239 3266   * global 'ncpus_online' is returned if pools are disabled, the caller
3240 3267   * is in the global zone, or a NULL zone argument is passed in.
3241 3268   */
3242 3269  int
3243 3270  zone_ncpus_online_get(zone_t *zone)
3244 3271  {
3245 3272          int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3246 3273  
3247 3274          return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3248 3275  }
3249 3276  
3250 3277  /*
3251 3278   * Return the pool to which the zone is currently bound.
3252 3279   */
3253 3280  pool_t *
3254 3281  zone_pool_get(zone_t *zone)
3255 3282  {
3256 3283          ASSERT(pool_lock_held());
3257 3284  
3258 3285          return (zone->zone_pool);
3259 3286  }
3260 3287  
3261 3288  /*
3262 3289   * Set the zone's pool pointer and update the zone's visibility to match
3263 3290   * the resources in the new pool.
3264 3291   */
3265 3292  void
3266 3293  zone_pool_set(zone_t *zone, pool_t *pool)
3267 3294  {
3268 3295          ASSERT(pool_lock_held());
3269 3296          ASSERT(MUTEX_HELD(&cpu_lock));
3270 3297  
3271 3298          zone->zone_pool = pool;
3272 3299          zone_pset_set(zone, pool->pool_pset->pset_id);
3273 3300  }
3274 3301  
3275 3302  /*
3276 3303   * Return the cached value of the id of the processor set to which the
3277 3304   * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3278 3305   * facility is disabled.
3279 3306   */
3280 3307  psetid_t
3281 3308  zone_pset_get(zone_t *zone)
3282 3309  {
3283 3310          ASSERT(MUTEX_HELD(&cpu_lock));
3284 3311  
3285 3312          return (zone->zone_psetid);
3286 3313  }
3287 3314  
3288 3315  /*
3289 3316   * Set the cached value of the id of the processor set to which the zone
3290 3317   * is currently bound.  Also update the zone's visibility to match the
3291 3318   * resources in the new processor set.
3292 3319   */
3293 3320  void
3294 3321  zone_pset_set(zone_t *zone, psetid_t newpsetid)
3295 3322  {
3296 3323          psetid_t oldpsetid;
3297 3324  
3298 3325          ASSERT(MUTEX_HELD(&cpu_lock));
3299 3326          oldpsetid = zone_pset_get(zone);
3300 3327  
3301 3328          if (oldpsetid == newpsetid)
3302 3329                  return;
3303 3330          /*
3304 3331           * Global zone sees all.
3305 3332           */
3306 3333          if (zone != global_zone) {
3307 3334                  zone->zone_psetid = newpsetid;
3308 3335                  if (newpsetid != ZONE_PS_INVAL)
3309 3336                          pool_pset_visibility_add(newpsetid, zone);
3310 3337                  if (oldpsetid != ZONE_PS_INVAL)
3311 3338                          pool_pset_visibility_remove(oldpsetid, zone);
3312 3339          }
3313 3340          /*
3314 3341           * Disabling pools, so we should start using the global values
3315 3342           * for ncpus and ncpus_online.
3316 3343           */
3317 3344          if (newpsetid == ZONE_PS_INVAL) {
3318 3345                  zone->zone_ncpus = 0;
3319 3346                  zone->zone_ncpus_online = 0;
3320 3347          }
3321 3348  }
3322 3349  
3323 3350  /*
3324 3351   * Walk the list of active zones and issue the provided callback for
3325 3352   * each of them.
3326 3353   *
3327 3354   * Caller must not be holding any locks that may be acquired under
3328 3355   * zonehash_lock.  See comment at the beginning of the file for a list of
3329 3356   * common locks and their interactions with zones.
3330 3357   */
3331 3358  int
3332 3359  zone_walk(int (*cb)(zone_t *, void *), void *data)
3333 3360  {
3334 3361          zone_t *zone;
3335 3362          int ret = 0;
3336 3363          zone_status_t status;
3337 3364  
3338 3365          mutex_enter(&zonehash_lock);
3339 3366          for (zone = list_head(&zone_active); zone != NULL;
3340 3367              zone = list_next(&zone_active, zone)) {
3341 3368                  /*
3342 3369                   * Skip zones that shouldn't be externally visible.
3343 3370                   */
3344 3371                  status = zone_status_get(zone);
3345 3372                  if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3346 3373                          continue;
3347 3374                  /*
3348 3375                   * Bail immediately if any callback invocation returns a
3349 3376                   * non-zero value.
3350 3377                   */
3351 3378                  ret = (*cb)(zone, data);
3352 3379                  if (ret != 0)
3353 3380                          break;
3354 3381          }
3355 3382          mutex_exit(&zonehash_lock);
3356 3383          return (ret);
3357 3384  }
3358 3385  
3359 3386  static int
3360 3387  zone_set_root(zone_t *zone, const char *upath)
3361 3388  {
3362 3389          vnode_t *vp;
3363 3390          int trycount;
3364 3391          int error = 0;
3365 3392          char *path;
3366 3393          struct pathname upn, pn;
3367 3394          size_t pathlen;
3368 3395  
3369 3396          if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3370 3397                  return (error);
3371 3398  
3372 3399          pn_alloc(&pn);
3373 3400  
3374 3401          /* prevent infinite loop */
3375 3402          trycount = 10;
3376 3403          for (;;) {
3377 3404                  if (--trycount <= 0) {
3378 3405                          error = ESTALE;
3379 3406                          goto out;
3380 3407                  }
3381 3408  
3382 3409                  if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3383 3410                          /*
3384 3411                           * VOP_ACCESS() may cover 'vp' with a new
3385 3412                           * filesystem, if 'vp' is an autoFS vnode.
3386 3413                           * Get the new 'vp' if so.
3387 3414                           */
3388 3415                          if ((error =
3389 3416                              VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3390 3417                              (!vn_ismntpt(vp) ||
3391 3418                              (error = traverse(&vp)) == 0)) {
3392 3419                                  pathlen = pn.pn_pathlen + 2;
3393 3420                                  path = kmem_alloc(pathlen, KM_SLEEP);
3394 3421                                  (void) strncpy(path, pn.pn_path,
3395 3422                                      pn.pn_pathlen + 1);
3396 3423                                  path[pathlen - 2] = '/';
3397 3424                                  path[pathlen - 1] = '\0';
3398 3425                                  pn_free(&pn);
3399 3426                                  pn_free(&upn);
3400 3427  
3401 3428                                  /* Success! */
3402 3429                                  break;
3403 3430                          }
3404 3431                          VN_RELE(vp);
3405 3432                  }
3406 3433                  if (error != ESTALE)
3407 3434                          goto out;
3408 3435          }
3409 3436  
3410 3437          ASSERT(error == 0);
3411 3438          zone->zone_rootvp = vp;         /* we hold a reference to vp */
3412 3439          zone->zone_rootpath = path;
3413 3440          zone->zone_rootpathlen = pathlen;
3414 3441          if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3415 3442                  zone->zone_flags |= ZF_IS_SCRATCH;
3416 3443          return (0);
3417 3444  
3418 3445  out:
3419 3446          pn_free(&pn);
3420 3447          pn_free(&upn);
3421 3448          return (error);
3422 3449  }
3423 3450  
/*
 * Return non-zero if c is an ASCII digit or an ASCII letter of either
 * case, and zero otherwise.
 */
static int
zone_isalnum(int c)
{
	return ((c >= '0' && c <= '9') ||
	    (c >= 'a' && c <= 'z') ||
	    (c >= 'A' && c <= 'Z'));
}

/*
 * isalnum() is routed through a function so that its argument is
 * evaluated exactly once; a purely textual expansion would evaluate it
 * up to six times, which misbehaves for arguments with side effects.
 */
#define	isalnum(c)	zone_isalnum(c)
3427 3454  
3428 3455  static int
3429 3456  zone_set_name(zone_t *zone, const char *uname)
3430 3457  {
3431 3458          char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3432 3459          size_t len;
3433 3460          int i, err;
3434 3461  
3435 3462          if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3436 3463                  kmem_free(kname, ZONENAME_MAX);
3437 3464                  return (err);   /* EFAULT or ENAMETOOLONG */
3438 3465          }
3439 3466  
3440 3467          /* must be less than ZONENAME_MAX */
3441 3468          if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3442 3469                  kmem_free(kname, ZONENAME_MAX);
3443 3470                  return (EINVAL);
3444 3471          }
3445 3472  
3446 3473          /*
3447 3474           * Name must start with an alphanumeric and must contain only
3448 3475           * alphanumerics, '-', '_' and '.'.
3449 3476           */
3450 3477          if (!isalnum(kname[0])) {
3451 3478                  kmem_free(kname, ZONENAME_MAX);
3452 3479                  return (EINVAL);
3453 3480          }
3454 3481          for (i = 1; i < len - 1; i++) {
3455 3482                  if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3456 3483                      kname[i] != '.') {
3457 3484                          kmem_free(kname, ZONENAME_MAX);
3458 3485                          return (EINVAL);
3459 3486                  }
3460 3487          }
3461 3488  
3462 3489          zone->zone_name = kname;
3463 3490          return (0);
3464 3491  }
3465 3492  
3466 3493  /*
3467 3494   * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3468 3495   * is NULL or it points to a zone with no hostid emulation, then the machine's
3469 3496   * hostid (i.e., the global zone's hostid) is returned.  This function returns
3470 3497   * zero if neither the zone nor the host machine (global zone) have hostids.  It
3471 3498   * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3472 3499   * hostid and the machine's hostid is invalid.
3473 3500   */
3474 3501  uint32_t
3475 3502  zone_get_hostid(zone_t *zonep)
3476 3503  {
3477 3504          unsigned long machine_hostid;
3478 3505  
3479 3506          if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3480 3507                  if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3481 3508                          return (HW_INVALID_HOSTID);
3482 3509                  return ((uint32_t)machine_hostid);
3483 3510          }
3484 3511          return (zonep->zone_hostid);
3485 3512  }
3486 3513  
3487 3514  /*
3488 3515   * Similar to thread_create(), but makes sure the thread is in the appropriate
3489 3516   * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3490 3517   */
3491 3518  /*ARGSUSED*/
3492 3519  kthread_t *
3493 3520  zthread_create(
3494 3521      caddr_t stk,
3495 3522      size_t stksize,
3496 3523      void (*proc)(),
3497 3524      void *arg,
3498 3525      size_t len,
3499 3526      pri_t pri)
3500 3527  {
3501 3528          kthread_t *t;
3502 3529          zone_t *zone = curproc->p_zone;
3503 3530          proc_t *pp = zone->zone_zsched;
3504 3531  
3505 3532          zone_hold(zone);        /* Reference to be dropped when thread exits */
3506 3533  
3507 3534          /*
3508 3535           * No-one should be trying to create threads if the zone is shutting
3509 3536           * down and there aren't any kernel threads around.  See comment
3510 3537           * in zthread_exit().
3511 3538           */
3512 3539          ASSERT(!(zone->zone_kthreads == NULL &&
3513 3540              zone_status_get(zone) >= ZONE_IS_EMPTY));
3514 3541          /*
3515 3542           * Create a thread, but don't let it run until we've finished setting
3516 3543           * things up.
3517 3544           */
3518 3545          t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3519 3546          ASSERT(t->t_forw == NULL);
3520 3547          mutex_enter(&zone_status_lock);
3521 3548          if (zone->zone_kthreads == NULL) {
3522 3549                  t->t_forw = t->t_back = t;
3523 3550          } else {
3524 3551                  kthread_t *tx = zone->zone_kthreads;
3525 3552  
3526 3553                  t->t_forw = tx;
3527 3554                  t->t_back = tx->t_back;
3528 3555                  tx->t_back->t_forw = t;
3529 3556                  tx->t_back = t;
3530 3557          }
3531 3558          zone->zone_kthreads = t;
3532 3559          mutex_exit(&zone_status_lock);
3533 3560  
3534 3561          mutex_enter(&pp->p_lock);
3535 3562          t->t_proc_flag |= TP_ZTHREAD;
3536 3563          project_rele(t->t_proj);
3537 3564          t->t_proj = project_hold(pp->p_task->tk_proj);
3538 3565  
3539 3566          /*
3540 3567           * Setup complete, let it run.
3541 3568           */
3542 3569          thread_lock(t);
3543 3570          t->t_schedflag |= TS_ALLSTART;
3544 3571          setrun_locked(t);
3545 3572          thread_unlock(t);
3546 3573  
3547 3574          mutex_exit(&pp->p_lock);
3548 3575  
3549 3576          return (t);
3550 3577  }
3551 3578  
3552 3579  /*
3553 3580   * Similar to thread_exit().  Must be called by threads created via
3554 3581   * zthread_exit().
3555 3582   */
3556 3583  void
3557 3584  zthread_exit(void)
3558 3585  {
3559 3586          kthread_t *t = curthread;
3560 3587          proc_t *pp = curproc;
3561 3588          zone_t *zone = pp->p_zone;
3562 3589  
3563 3590          mutex_enter(&zone_status_lock);
3564 3591  
3565 3592          /*
3566 3593           * Reparent to p0
3567 3594           */
3568 3595          kpreempt_disable();
3569 3596          mutex_enter(&pp->p_lock);
3570 3597          t->t_proc_flag &= ~TP_ZTHREAD;
3571 3598          t->t_procp = &p0;
3572 3599          hat_thread_exit(t);
3573 3600          mutex_exit(&pp->p_lock);
3574 3601          kpreempt_enable();
3575 3602  
3576 3603          if (t->t_back == t) {
3577 3604                  ASSERT(t->t_forw == t);
3578 3605                  /*
3579 3606                   * If the zone is empty, once the thread count
3580 3607                   * goes to zero no further kernel threads can be
3581 3608                   * created.  This is because if the creator is a process
3582 3609                   * in the zone, then it must have exited before the zone
3583 3610                   * state could be set to ZONE_IS_EMPTY.
3584 3611                   * Otherwise, if the creator is a kernel thread in the
3585 3612                   * zone, the thread count is non-zero.
3586 3613                   *
3587 3614                   * This really means that non-zone kernel threads should
3588 3615                   * not create zone kernel threads.
3589 3616                   */
3590 3617                  zone->zone_kthreads = NULL;
3591 3618                  if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3592 3619                          zone_status_set(zone, ZONE_IS_DOWN);
3593 3620                          /*
3594 3621                           * Remove any CPU caps on this zone.
3595 3622                           */
3596 3623                          cpucaps_zone_remove(zone);
3597 3624                  }
3598 3625          } else {
3599 3626                  t->t_forw->t_back = t->t_back;
3600 3627                  t->t_back->t_forw = t->t_forw;
3601 3628                  if (zone->zone_kthreads == t)
3602 3629                          zone->zone_kthreads = t->t_forw;
3603 3630          }
3604 3631          mutex_exit(&zone_status_lock);
3605 3632          zone_rele(zone);
3606 3633          thread_exit();
3607 3634          /* NOTREACHED */
3608 3635  }
3609 3636  
3610 3637  static void
3611 3638  zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3612 3639  {
3613 3640          vnode_t *oldvp;
3614 3641  
3615 3642          /* we're going to hold a reference here to the directory */
3616 3643          VN_HOLD(vp);
3617 3644  
3618 3645          /* update abs cwd/root path see c2/audit.c */
3619 3646          if (AU_AUDITING())
3620 3647                  audit_chdirec(vp, vpp);
3621 3648  
3622 3649          mutex_enter(&pp->p_lock);
3623 3650          oldvp = *vpp;
3624 3651          *vpp = vp;
3625 3652          mutex_exit(&pp->p_lock);
3626 3653          if (oldvp != NULL)
3627 3654                  VN_RELE(oldvp);
3628 3655  }
3629 3656  
3630 3657  /*
3631 3658   * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3632 3659   */
3633 3660  static int
3634 3661  nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3635 3662  {
3636 3663          nvpair_t *nvp = NULL;
3637 3664          boolean_t priv_set = B_FALSE;
3638 3665          boolean_t limit_set = B_FALSE;
3639 3666          boolean_t action_set = B_FALSE;
3640 3667  
3641 3668          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3642 3669                  const char *name;
3643 3670                  uint64_t ui64;
3644 3671  
3645 3672                  name = nvpair_name(nvp);
3646 3673                  if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3647 3674                          return (EINVAL);
3648 3675                  (void) nvpair_value_uint64(nvp, &ui64);
3649 3676                  if (strcmp(name, "privilege") == 0) {
3650 3677                          /*
3651 3678                           * Currently only privileged values are allowed, but
3652 3679                           * this may change in the future.
3653 3680                           */
3654 3681                          if (ui64 != RCPRIV_PRIVILEGED)
3655 3682                                  return (EINVAL);
3656 3683                          rv->rcv_privilege = ui64;
3657 3684                          priv_set = B_TRUE;
3658 3685                  } else if (strcmp(name, "limit") == 0) {
3659 3686                          rv->rcv_value = ui64;
3660 3687                          limit_set = B_TRUE;
3661 3688                  } else if (strcmp(name, "action") == 0) {
3662 3689                          if (ui64 != RCTL_LOCAL_NOACTION &&
3663 3690                              ui64 != RCTL_LOCAL_DENY)
3664 3691                                  return (EINVAL);
3665 3692                          rv->rcv_flagaction = ui64;
3666 3693                          action_set = B_TRUE;
3667 3694                  } else {
3668 3695                          return (EINVAL);
3669 3696                  }
3670 3697          }
3671 3698  
3672 3699          if (!(priv_set && limit_set && action_set))
3673 3700                  return (EINVAL);
3674 3701          rv->rcv_action_signal = 0;
3675 3702          rv->rcv_action_recipient = NULL;
3676 3703          rv->rcv_action_recip_pid = -1;
3677 3704          rv->rcv_firing_time = 0;
3678 3705  
3679 3706          return (0);
3680 3707  }
3681 3708  
3682 3709  /*
3683 3710   * Non-global zone version of start_init.
3684 3711   */
3685 3712  void
3686 3713  zone_start_init(void)
3687 3714  {
3688 3715          proc_t *p = ttoproc(curthread);
3689 3716          zone_t *z = p->p_zone;
3690 3717  
3691 3718          ASSERT(!INGLOBALZONE(curproc));
3692 3719  
3693 3720          /*
3694 3721           * For all purposes (ZONE_ATTR_INITPID and restart_init),
3695 3722           * storing just the pid of init is sufficient.
3696 3723           */
3697 3724          z->zone_proc_initpid = p->p_pid;
3698 3725  
3699 3726          /*
3700 3727           * We maintain zone_boot_err so that we can return the cause of the
3701 3728           * failure back to the caller of the zone_boot syscall.
3702 3729           */
3703 3730          p->p_zone->zone_boot_err = start_init_common();
3704 3731  
3705 3732          /*
3706 3733           * We will prevent booting zones from becoming running zones if the
3707 3734           * global zone is shutting down.
3708 3735           */
3709 3736          mutex_enter(&zone_status_lock);
3710 3737          if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3711 3738              ZONE_IS_SHUTTING_DOWN) {
3712 3739                  /*
3713 3740                   * Make sure we are still in the booting state-- we could have
3714 3741                   * raced and already be shutting down, or even further along.
3715 3742                   */
3716 3743                  if (zone_status_get(z) == ZONE_IS_BOOTING) {
3717 3744                          zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3718 3745                  }
3719 3746                  mutex_exit(&zone_status_lock);
3720 3747                  /* It's gone bad, dispose of the process */
3721 3748                  if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3722 3749                          mutex_enter(&p->p_lock);
3723 3750                          ASSERT(p->p_flag & SEXITLWPS);
3724 3751                          lwp_exit();
3725 3752                  }
3726 3753          } else {
3727 3754                  if (zone_status_get(z) == ZONE_IS_BOOTING)
3728 3755                          zone_status_set(z, ZONE_IS_RUNNING);
3729 3756                  mutex_exit(&zone_status_lock);
3730 3757                  /* cause the process to return to userland. */
3731 3758                  lwp_rtt();
3732 3759          }
3733 3760  }
3734 3761  
3735 3762  struct zsched_arg {  /* argument bundle that zsched() receives through its void *arg */
3736 3763          zone_t *zone;           /* zone the zsched process is being set up for */
3737 3764          nvlist_t *nvlist;       /* rctl name/value-array pairs zsched() applies to the zone */
3738 3765  };
3739 3766  
3740 3767  /*
3741 3768   * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3742 3769   * anything to do with scheduling, but rather with the fact that
3743 3770   * per-zone kernel threads are parented to zsched, just like regular
3744 3771   * kernel threads are parented to sched (p0).
3745 3772   *
3746 3773   * zsched is also responsible for launching init for the zone.
            *
            * 'arg' points to a struct zsched_arg naming the zone to serve and the
            * nvlist of rctls to apply to it.  The function walks the zone through
            * ZONE_IS_INITIALIZED and ZONE_IS_READY, waits for ZONE_IS_BOOTING to
            * start init, then waits for ZONE_IS_DYING and finishes by calling exit().
3747 3774   */
3748 3775  static void
3749 3776  zsched(void *arg)
3750 3777  {
3751 3778          struct zsched_arg *za = arg;
3752 3779          proc_t *pp = curproc;
3753 3780          proc_t *initp = proc_init;
3754 3781          zone_t *zone = za->zone;
3755 3782          cred_t *cr, *oldcred;
3756 3783          rctl_set_t *set;
3757 3784          rctl_alloc_gp_t *gp;
3758 3785          contract_t *ct = NULL;
3759 3786          task_t *tk, *oldtk;
3760 3787          rctl_entity_p_t e;
3761 3788          kproject_t *pj;
3762 3789  
3763 3790          nvlist_t *nvl = za->nvlist;
3764 3791          nvpair_t *nvp = NULL;
3765 3792  
3766 3793          bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3767 3794          bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3768 3795          PTOU(pp)->u_argc = 0;
3769 3796          PTOU(pp)->u_argv = NULL;
3770 3797          PTOU(pp)->u_envp = NULL;
3771 3798          closeall(P_FINFO(pp));
3772 3799  
3773 3800          /*
3774 3801           * We are this zone's "zsched" process.  As the zone isn't generally
3775 3802           * visible yet we don't need to grab any locks before initializing its
3776 3803           * zone_zsched pointer.
3777 3804           */
3778 3805          zone_hold(zone);  /* this hold is released by zone_destroy() */
3779 3806          zone->zone_zsched = pp;
3780 3807          mutex_enter(&pp->p_lock);
3781 3808          pp->p_zone = zone;
3782 3809          mutex_exit(&pp->p_lock);
3783 3810  
3784 3811          /*
3785 3812           * Disassociate process from its 'parent'; parent ourselves to init
3786 3813           * (pid 1) and change other values as needed.
3787 3814           */
3788 3815          sess_create();
3789 3816  
3790 3817          mutex_enter(&pidlock);
3791 3818          proc_detach(pp);
3792 3819          pp->p_ppid = 1;
3793 3820          pp->p_flag |= SZONETOP;
3794 3821          pp->p_ancpid = 1;
3795 3822          pp->p_parent = initp;
3796 3823          pp->p_psibling = NULL;
3797 3824          if (initp->p_child)
3798 3825                  initp->p_child->p_psibling = pp;
3799 3826          pp->p_sibling = initp->p_child;
3800 3827          initp->p_child = pp;
3801 3828  
3802 3829          /* Decrement what newproc() incremented. */
3803 3830          upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3804 3831          /*
3805 3832           * Our credentials are about to become kcred-like, so we don't care
3806 3833           * about the caller's ruid.
3807 3834           */
3808 3835          upcount_inc(crgetruid(kcred), zone->zone_id);
3809 3836          mutex_exit(&pidlock);
3810 3837  
3811 3838          /*
3812 3839           * getting out of global zone, so decrement lwp and process counts
3813 3840           */
3814 3841          pj = pp->p_task->tk_proj;
3815 3842          mutex_enter(&global_zone->zone_nlwps_lock);
3816 3843          pj->kpj_nlwps -= pp->p_lwpcnt;
3817 3844          global_zone->zone_nlwps -= pp->p_lwpcnt;
3818 3845          pj->kpj_nprocs--;
3819 3846          global_zone->zone_nprocs--;
3820 3847          mutex_exit(&global_zone->zone_nlwps_lock);
3821 3848  
3822 3849          /*
3823 3850           * Decrement locked memory counts on old zone and project.
3824 3851           */
3825 3852          mutex_enter(&global_zone->zone_mem_lock);
3826 3853          global_zone->zone_locked_mem -= pp->p_locked_mem;
3827 3854          pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3828 3855          mutex_exit(&global_zone->zone_mem_lock);
3829 3856  
3830 3857          /*
3831 3858           * Create and join a new task in project '0' of this zone.
3832 3859           *
3833 3860           * We don't need to call holdlwps() since we know we're the only lwp in
3834 3861           * this process.
3835 3862           *
3836 3863           * task_join() returns with p_lock held.
3837 3864           */
3838 3865          tk = task_create(0, zone);
3839 3866          mutex_enter(&cpu_lock);
3840 3867          oldtk = task_join(tk, 0);
3841 3868  
3842 3869          pj = pp->p_task->tk_proj;
3843 3870  
3844 3871          mutex_enter(&zone->zone_mem_lock);
3845 3872          zone->zone_locked_mem += pp->p_locked_mem;
3846 3873          pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3847 3874          mutex_exit(&zone->zone_mem_lock);
3848 3875  
3849 3876          /*
3850 3877           * add lwp and process counts to zsched's zone, and increment
3851 3878           * project's task and process count due to the task created in
3852 3879           * the above task_create.
3853 3880           */
3854 3881          mutex_enter(&zone->zone_nlwps_lock);
3855 3882          pj->kpj_nlwps += pp->p_lwpcnt;
3856 3883          pj->kpj_ntasks += 1;
3857 3884          zone->zone_nlwps += pp->p_lwpcnt;
3858 3885          pj->kpj_nprocs++;
3859 3886          zone->zone_nprocs++;
3860 3887          mutex_exit(&zone->zone_nlwps_lock);
3861 3888  
3862 3889          mutex_exit(&curproc->p_lock);
3863 3890          mutex_exit(&cpu_lock);
3864 3891          task_rele(oldtk);
3865 3892  
3866 3893          /*
3867 3894           * The process was created by a process in the global zone, hence the
3868 3895           * credentials are wrong.  We might as well have kcred-ish credentials.
3869 3896           */
3870 3897          cr = zone->zone_kcred;
3871 3898          crhold(cr);
3872 3899          mutex_enter(&pp->p_crlock);
3873 3900          oldcred = pp->p_cred;
3874 3901          pp->p_cred = cr;
3875 3902          mutex_exit(&pp->p_crlock);
3876 3903          crfree(oldcred);
3877 3904  
3878 3905          /*
3879 3906           * Hold credentials again (for thread)
3880 3907           */
3881 3908          crhold(cr);
3882 3909  
3883 3910          /*
3884 3911           * p_lwpcnt can't change since this is a kernel process.
3885 3912           */
3886 3913          crset(pp, cr);
3887 3914  
3888 3915          /*
3889 3916           * Chroot
           *
           * Both the current directory and the root directory of this
           * process are pointed at the zone's root vnode.
3890 3917           */
3891 3918          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3892 3919          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3893 3920  
3894 3921          /*
3895 3922           * Initialize zone's rctl set.
3896 3923           */
3897 3924          set = rctl_set_create();
3898 3925          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3899 3926          mutex_enter(&pp->p_lock);
3900 3927          e.rcep_p.zone = zone;
3901 3928          e.rcep_t = RCENTITY_ZONE;
3902 3929          zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3903 3930          mutex_exit(&pp->p_lock);
3904 3931          rctl_prealloc_destroy(gp);
3905 3932  
3906 3933          /*
3907 3934           * Apply the rctls passed in to zone_create().  This is basically a list
3908 3935           * assignment: all of the old values are removed and the new ones
3909 3936           * inserted.  That is, if an empty list is passed in, all values are
3910 3937           * removed.
3911 3938           */
3912 3939          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3913 3940                  rctl_dict_entry_t *rde;
3914 3941                  rctl_hndl_t hndl;
3915 3942                  char *name;
3916 3943                  nvlist_t **nvlarray;
3917 3944                  uint_t i, nelem;
3918 3945                  int error;      /* For ASSERT()s */
3919 3946  
3920 3947                  name = nvpair_name(nvp);
3921 3948                  hndl = rctl_hndl_lookup(name);
3922 3949                  ASSERT(hndl != -1);
3923 3950                  rde = rctl_dict_lookup_hndl(hndl);
3924 3951                  ASSERT(rde != NULL);
3925 3952  
                           /*
                            * Remove the old values: repeatedly fetch the first
                            * local value and delete it, stopping once the first
                            * value left is the RCPRIV_SYSTEM one.
                            */
3926 3953                  for (; /* ever */; ) {
3927 3954                          rctl_val_t oval;
3928 3955  
3929 3956                          mutex_enter(&pp->p_lock);
3930 3957                          error = rctl_local_get(hndl, NULL, &oval, pp);
3931 3958                          mutex_exit(&pp->p_lock);
3932 3959                          ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3933 3960                          ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3934 3961                          if (oval.rcv_privilege == RCPRIV_SYSTEM)
3935 3962                                  break;
3936 3963                          mutex_enter(&pp->p_lock);
3937 3964                          error = rctl_local_delete(hndl, &oval, pp);
3938 3965                          mutex_exit(&pp->p_lock);
3939 3966                          ASSERT(error == 0);
3940 3967                  }
3941 3968                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3942 3969                  ASSERT(error == 0);
3943 3970                  for (i = 0; i < nelem; i++) {
3944 3971                          rctl_val_t *nvalp;
3945 3972  
3946 3973                          nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
3947 3974                          error = nvlist2rctlval(nvlarray[i], nvalp);
3948 3975                          ASSERT(error == 0);

↓ open down ↓

1441 lines elided

↑ open up ↑

3949 3976                          /*
3950 3977                           * rctl_local_insert can fail if the value being
3951 3978                           * inserted is a duplicate; this is OK.
                                   * On failure the value is returned to the cache
                                   * here.
3952 3979                           */
3953 3980                          mutex_enter(&pp->p_lock);
3954 3981                          if (rctl_local_insert(hndl, nvalp, pp) != 0)
3955 3982                                  kmem_cache_free(rctl_val_cache, nvalp);
3956 3983                          mutex_exit(&pp->p_lock);
3957 3984                  }
3958 3985          }
     3986 +
3959 3987          /*
3960 3988           * Tell the world that we're done setting up.
3961 3989           *
3962 3990           * At this point we want to set the zone status to ZONE_IS_INITIALIZED
3963 3991           * and atomically set the zone's processor set visibility.  Once
3964 3992           * we drop pool_lock() this zone will automatically get updated
3965 3993           * to reflect any future changes to the pools configuration.
3966 3994           *
3967 3995           * Note that after we drop the locks below (zonehash_lock in
3968 3996           * particular) other operations such as a zone_getattr call can

3969 3997           * now proceed and observe the zone. That is the reason for doing a
3970 3998           * state transition to the INITIALIZED state.
3971 3999           */
3972 4000          pool_lock();
3973 4001          mutex_enter(&cpu_lock);
3974 4002          mutex_enter(&zonehash_lock);
3975 4003          zone_uniqid(zone);
3976 4004          zone_zsd_configure(zone);
3977 4005          if (pool_state == POOL_ENABLED)
3978 4006                  zone_pset_set(zone, pool_default->pool_pset->pset_id);
3979 4007          mutex_enter(&zone_status_lock);
3980 4008          ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
3981 4009          zone_status_set(zone, ZONE_IS_INITIALIZED);
3982 4010          mutex_exit(&zone_status_lock);
3983 4011          mutex_exit(&zonehash_lock);
3984 4012          mutex_exit(&cpu_lock);
3985 4013          pool_unlock();
3986 4014  
3987 4015          /* Now call the create callback for this key */
3988 4016          zsd_apply_all_keys(zsd_apply_create, zone);
3989 4017  
3990 4018          /* The callbacks are complete. Mark ZONE_IS_READY */
3991 4019          mutex_enter(&zone_status_lock);
3992 4020          ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
3993 4021          zone_status_set(zone, ZONE_IS_READY);
3994 4022          mutex_exit(&zone_status_lock);
3995 4023  
3996 4024          /*
3997 4025           * Once we see the zone transition to the ZONE_IS_BOOTING state,
3998 4026           * we launch init, and set the state to running.
3999 4027           */
4000 4028          zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4001 4029  
4002 4030          if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4003 4031                  id_t cid;
4004 4032  
4005 4033                  /*
4006 4034                   * Ok, this is a little complicated.  We need to grab the
4007 4035                   * zone's pool's scheduling class ID; note that by now, we
4008 4036                   * are already bound to a pool if we need to be (zoneadmd
4009 4037                   * will have done that to us while we're in the READY
4010 4038                   * state).  *But* the scheduling class for the zone's 'init'
4011 4039                   * must be explicitly passed to newproc, which doesn't
4012 4040                   * respect pool bindings.
4013 4041                   *
4014 4042                   * We hold the pool_lock across the call to newproc() to
4015 4043                   * close the obvious race: the pool's scheduling class
4016 4044                   * could change before we manage to create the LWP with
4017 4045                   * classid 'cid'.
                           *
                           * The class is chosen in this order: the zone's own
                           * zone_defaultcid when positive, else the pool's class,
                           * else the system defaultcid when the pool reports -1.
4018 4046                   */
4019 4047                  pool_lock();
4020 4048                  if (zone->zone_defaultcid > 0)
4021 4049                          cid = zone->zone_defaultcid;
4022 4050                  else
4023 4051                          cid = pool_get_class(zone->zone_pool);
4024 4052                  if (cid == -1)
4025 4053                          cid = defaultcid;
4026 4054  
4027 4055                  /*
4028 4056                   * If this fails, zone_boot will ultimately fail.  The
4029 4057                   * state of the zone will be set to SHUTTING_DOWN-- userland
4030 4058                   * will have to tear down the zone, and fail, or try again.
4031 4059                   */
4032 4060                  if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4033 4061                      minclsyspri - 1, &ct, 0)) != 0) {
4034 4062                          mutex_enter(&zone_status_lock);
4035 4063                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4036 4064                          mutex_exit(&zone_status_lock);
4037 4065                  } else {
4038 4066                          zone->zone_boot_time = gethrestime_sec();
4039 4067                  }
4040 4068  
4041 4069                  pool_unlock();
4042 4070          }
4043 4071  
4044 4072          /*
4045 4073           * Wait for zone_destroy() to be called.  This is what we spend
4046 4074           * most of our life doing.
4047 4075           */
4048 4076          zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4049 4077  
4050 4078          if (ct)
4051 4079                  /*
4052 4080                   * At this point the process contract should be empty.
4053 4081                   * (Though if it isn't, it's not the end of the world.)
                           *
                           * Note that this comment sits between the brace-less
                           * "if (ct)" and its body: the VERIFY below is the
                           * statement the if guards.  ct is still NULL here
                           * unless the newproc() call above stored through &ct.
4054 4082                   */
4055 4083                  VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4056 4084  
4057 4085          /*
4058 4086           * Allow kcred to be freed when all referring processes
4059 4087           * (including this one) go away.  We can't just do this in
4060 4088           * zone_free because we need to wait for the zone_cred_ref to
4061 4089           * drop to 0 before calling zone_free, and the existence of
4062 4090           * zone_kcred will prevent that.  Thus, we call crfree here to
4063 4091           * balance the crdup in zone_create.  The crhold calls earlier
4064 4092           * in zsched will be dropped when the thread and process exit.
4065 4093           */
4066 4094          crfree(zone->zone_kcred);
4067 4095          zone->zone_kcred = NULL;
4068 4096  
4069 4097          exit(CLD_EXITED, 0);
4070 4098  }
4071 4099  
4072 4100  /*
4073 4101   * Helper function to determine if there are any submounts of the
4074 4102   * provided path.  Used to make sure the zone doesn't "inherit" any
4075 4103   * mounts from before it is created.
4076 4104   */
4077 4105  static uint_t
4078 4106  zone_mount_count(const char *rootpath)
4079 4107  {
4080 4108          vfs_t *vfsp;
4081 4109          uint_t count = 0;
4082 4110          size_t rootpathlen = strlen(rootpath);
4083 4111  
4084 4112          /*
4085 4113           * Holding zonehash_lock prevents race conditions with
4086 4114           * vfs_list_add()/vfs_list_remove() since we serialize with
4087 4115           * zone_find_by_path().
4088 4116           */
4089 4117          ASSERT(MUTEX_HELD(&zonehash_lock));
4090 4118          /*
4091 4119           * The rootpath must end with a '/'
4092 4120           */
4093 4121          ASSERT(rootpath[rootpathlen - 1] == '/');
4094 4122  
4095 4123          /*
4096 4124           * This intentionally does not count the rootpath itself if that
4097 4125           * happens to be a mount point.
4098 4126           */
4099 4127          vfs_list_read_lock();
4100 4128          vfsp = rootvfs;
4101 4129          do {
4102 4130                  if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4103 4131                      rootpathlen) == 0)
4104 4132                          count++;
4105 4133                  vfsp = vfsp->vfs_next;
4106 4134          } while (vfsp != rootvfs);
4107 4135          vfs_list_unlock();
4108 4136          return (count);
4109 4137  }
4110 4138  
4111 4139  /*
4112 4140   * Helper function to make sure that a zone created on 'rootpath'
4113 4141   * wouldn't end up containing other zones' rootpaths.
4114 4142   */
4115 4143  static boolean_t
4116 4144  zone_is_nested(const char *rootpath)
4117 4145  {
4118 4146          zone_t *zone;
4119 4147          size_t rootpathlen = strlen(rootpath);
4120 4148          size_t len;
4121 4149  
4122 4150          ASSERT(MUTEX_HELD(&zonehash_lock));
4123 4151  
4124 4152          /*
4125 4153           * zone_set_root() appended '/' and '\0' at the end of rootpath
4126 4154           */
4127 4155          if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4128 4156              (rootpath[1] == '/') && (rootpath[2] == '\0'))
4129 4157                  return (B_TRUE);
4130 4158  
4131 4159          for (zone = list_head(&zone_active); zone != NULL;
4132 4160              zone = list_next(&zone_active, zone)) {
4133 4161                  if (zone == global_zone)
4134 4162                          continue;
4135 4163                  len = strlen(zone->zone_rootpath);
4136 4164                  if (strncmp(rootpath, zone->zone_rootpath,
4137 4165                      MIN(rootpathlen, len)) == 0)
4138 4166                          return (B_TRUE);
4139 4167          }
4140 4168          return (B_FALSE);
4141 4169  }
4142 4170  
4143 4171  static int
4144 4172  zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4145 4173      size_t zone_privssz)
4146 4174  {
4147 4175          priv_set_t *privs;
4148 4176  
4149 4177          if (zone_privssz < sizeof (priv_set_t))
4150 4178                  return (ENOMEM);
4151 4179  
4152 4180          privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4153 4181  
4154 4182          if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4155 4183                  kmem_free(privs, sizeof (priv_set_t));
4156 4184                  return (EFAULT);
4157 4185          }
4158 4186  
4159 4187          zone->zone_privset = privs;
4160 4188          return (0);
4161 4189  }
4162 4190  
4163 4191  /*
4164 4192   * We make creative use of nvlists to pass in rctls from userland.  The list is
4165 4193   * a list of the following structures:
4166 4194   *
4167 4195   * (name = rctl_name, value = nvpair_list_array)
4168 4196   *
4169 4197   * Where each element of the nvpair_list_array is of the form:
4170 4198   *
4171 4199   * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4172 4200   *      (name = "limit", value = uint64_t),
4173 4201   *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4174 4202   */
4175 4203  static int
4176 4204  parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4177 4205  {
4178 4206          nvpair_t *nvp = NULL;
4179 4207          nvlist_t *nvl = NULL;
4180 4208          char *kbuf;
4181 4209          int error;
4182 4210          rctl_val_t rv;
4183 4211  
4184 4212          *nvlp = NULL;
4185 4213  
4186 4214          if (buflen == 0)
4187 4215                  return (0);
4188 4216  
4189 4217          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4190 4218                  return (ENOMEM);
4191 4219          if (copyin(ubuf, kbuf, buflen)) {
4192 4220                  error = EFAULT;
4193 4221                  goto out;
4194 4222          }
4195 4223          if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4196 4224                  /*
4197 4225                   * nvl may have been allocated/free'd, but the value set to
4198 4226                   * non-NULL, so we reset it here.
4199 4227                   */
4200 4228                  nvl = NULL;
4201 4229                  error = EINVAL;
4202 4230                  goto out;
4203 4231          }
4204 4232          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4205 4233                  rctl_dict_entry_t *rde;
4206 4234                  rctl_hndl_t hndl;
4207 4235                  nvlist_t **nvlarray;
4208 4236                  uint_t i, nelem;
4209 4237                  char *name;
4210 4238  
4211 4239                  error = EINVAL;
4212 4240                  name = nvpair_name(nvp);
4213 4241                  if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4214 4242                      != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4215 4243                          goto out;
4216 4244                  }
4217 4245                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
4218 4246                          goto out;
4219 4247                  }
4220 4248                  rde = rctl_dict_lookup_hndl(hndl);
4221 4249                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4222 4250                  ASSERT(error == 0);
4223 4251                  for (i = 0; i < nelem; i++) {
4224 4252                          if (error = nvlist2rctlval(nvlarray[i], &rv))
4225 4253                                  goto out;
4226 4254                  }
4227 4255                  if (rctl_invalid_value(rde, &rv)) {
4228 4256                          error = EINVAL;
4229 4257                          goto out;
4230 4258                  }
4231 4259          }

↓ open down ↓

263 lines elided

↑ open up ↑

4232 4260          error = 0;
4233 4261          *nvlp = nvl;
4234 4262  out:
4235 4263          kmem_free(kbuf, buflen);
4236 4264          if (error && nvl != NULL)
4237 4265                  nvlist_free(nvl);
4238 4266          return (error);
4239 4267  }
4240 4268  
4241 4269  int
4242      -zone_create_error(int er_error, int er_ext, int *er_out) {
     4270 +zone_create_error(int er_error, int er_ext, int *er_out)
     4271 +{
4243 4272          if (er_out != NULL) {
4244 4273                  if (copyout(&er_ext, er_out, sizeof (int))) {
4245 4274                          return (set_errno(EFAULT));
4246 4275                  }
4247 4276          }
4248 4277          return (set_errno(er_error));
4249 4278  }
4250 4279  
4251 4280  static int
4252 4281  zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)

4253 4282  {
4254 4283          ts_label_t *tsl;
4255 4284          bslabel_t blab;
4256 4285  
4257 4286          /* Get label from user */
4258 4287          if (copyin(lab, &blab, sizeof (blab)) != 0)
4259 4288                  return (EFAULT);
4260 4289          tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4261 4290          if (tsl == NULL)
4262 4291                  return (ENOMEM);
4263 4292  
4264 4293          zone->zone_slabel = tsl;
4265 4294          return (0);
4266 4295  }
4267 4296  
4268 4297  /*
4269 4298   * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4270 4299   */
4271 4300  static int
4272 4301  parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4273 4302  {
4274 4303          char *kbuf;
4275 4304          char *dataset, *next;
4276 4305          zone_dataset_t *zd;
4277 4306          size_t len;
4278 4307  
4279 4308          if (ubuf == NULL || buflen == 0)
4280 4309                  return (0);
4281 4310  
4282 4311          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4283 4312                  return (ENOMEM);
4284 4313  
4285 4314          if (copyin(ubuf, kbuf, buflen) != 0) {
4286 4315                  kmem_free(kbuf, buflen);
4287 4316                  return (EFAULT);
4288 4317          }
4289 4318  
4290 4319          dataset = next = kbuf;
4291 4320          for (;;) {
4292 4321                  zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4293 4322  
4294 4323                  next = strchr(dataset, ',');
4295 4324  
4296 4325                  if (next == NULL)
4297 4326                          len = strlen(dataset);
4298 4327                  else
4299 4328                          len = next - dataset;
4300 4329  
4301 4330                  zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4302 4331                  bcopy(dataset, zd->zd_dataset, len);
4303 4332                  zd->zd_dataset[len] = '\0';
4304 4333  
4305 4334                  list_insert_head(&zone->zone_datasets, zd);
4306 4335  
4307 4336                  if (next == NULL)
4308 4337                          break;
4309 4338  
4310 4339                  dataset = next + 1;
4311 4340          }
4312 4341  
4313 4342          kmem_free(kbuf, buflen);
4314 4343          return (0);
4315 4344  }
4316 4345  
4317 4346  /*
4318 4347   * System call to create/initialize a new zone named 'zone_name', rooted
4319 4348   * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4320 4349   * and initialized with the zone-wide rctls described in 'rctlbuf', and
4321 4350   * with labeling set by 'match', 'doi', and 'label'.
4322 4351   *
4323 4352   * If extended error is non-null, we may use it to return more detailed
4324 4353   * error information.
4325 4354   */
4326 4355  static zoneid_t
4327 4356  zone_create(const char *zone_name, const char *zone_root,
4328 4357      const priv_set_t *zone_privs, size_t zone_privssz,
4329 4358      caddr_t rctlbuf, size_t rctlbufsz,
4330 4359      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4331 4360      int match, uint32_t doi, const bslabel_t *label,
4332 4361      int flags)
4333 4362  {
4334 4363          struct zsched_arg zarg;
4335 4364          nvlist_t *rctls = NULL;
4336 4365          proc_t *pp = curproc;
4337 4366          zone_t *zone, *ztmp;
4338 4367          zoneid_t zoneid;
4339 4368          int error;
4340 4369          int error2 = 0;
4341 4370          char *str;
4342 4371          cred_t *zkcr;
4343 4372          boolean_t insert_label_hash;
4344 4373  
4345 4374          if (secpolicy_zone_config(CRED()) != 0)
4346 4375                  return (set_errno(EPERM));
4347 4376  
4348 4377          /* can't boot zone from within chroot environment */
4349 4378          if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4350 4379                  return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4351 4380                      extended_error));
4352 4381  
4353 4382          zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4354 4383          zoneid = zone->zone_id = id_alloc(zoneid_space);
4355 4384          zone->zone_status = ZONE_IS_UNINITIALIZED;
4356 4385          zone->zone_pool = pool_default;
4357 4386          zone->zone_pool_mod = gethrtime();
4358 4387          zone->zone_psetid = ZONE_PS_INVAL;
4359 4388          zone->zone_ncpus = 0;
4360 4389          zone->zone_ncpus_online = 0;
4361 4390          zone->zone_restart_init = B_TRUE;
4362 4391          zone->zone_brand = &native_brand;
4363 4392          zone->zone_initname = NULL;
4364 4393          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4365 4394          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4366 4395          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4367 4396          cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4368 4397          list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4369 4398              offsetof(zone_ref_t, zref_linkage));
4370 4399          list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4371 4400              offsetof(struct zsd_entry, zsd_linkage));
4372 4401          list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4373 4402              offsetof(zone_dataset_t, zd_linkage));
4374 4403          list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4375 4404              offsetof(zone_dl_t, zdl_linkage));
4376 4405          rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4377 4406          rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4378 4407  
4379 4408          if (flags & ZCF_NET_EXCL) {
4380 4409                  zone->zone_flags |= ZF_NET_EXCL;
4381 4410          }
4382 4411  
4383 4412          if ((error = zone_set_name(zone, zone_name)) != 0) {
4384 4413                  zone_free(zone);
4385 4414                  return (zone_create_error(error, 0, extended_error));
4386 4415          }
4387 4416  
4388 4417          if ((error = zone_set_root(zone, zone_root)) != 0) {
4389 4418                  zone_free(zone);
4390 4419                  return (zone_create_error(error, 0, extended_error));
4391 4420          }
4392 4421          if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4393 4422                  zone_free(zone);
4394 4423                  return (zone_create_error(error, 0, extended_error));
4395 4424          }
4396 4425  
4397 4426          /* initialize node name to be the same as zone name */
4398 4427          zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4399 4428          (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4400 4429          zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4401 4430

↓ open down ↓

149 lines elided

↑ open up ↑

4402 4431          zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4403 4432          zone->zone_domain[0] = '\0';
4404 4433          zone->zone_hostid = HW_INVALID_HOSTID;
4405 4434          zone->zone_shares = 1;
4406 4435          zone->zone_shmmax = 0;
4407 4436          zone->zone_ipc.ipcq_shmmni = 0;
4408 4437          zone->zone_ipc.ipcq_semmni = 0;
4409 4438          zone->zone_ipc.ipcq_msgmni = 0;
4410 4439          zone->zone_bootargs = NULL;
4411 4440          zone->zone_fs_allowed = NULL;
     4441 +
     4442 +        secflags_zero(&zone0.zone_secflags.psf_lower);
     4443 +        secflags_zero(&zone0.zone_secflags.psf_effective);
     4444 +        secflags_zero(&zone0.zone_secflags.psf_inherit);
     4445 +        secflags_fullset(&zone0.zone_secflags.psf_upper);
     4446 +
4412 4447          zone->zone_initname =
4413 4448              kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4414 4449          (void) strcpy(zone->zone_initname, zone_default_initname);
4415 4450          zone->zone_nlwps = 0;
4416 4451          zone->zone_nlwps_ctl = INT_MAX;
4417 4452          zone->zone_nprocs = 0;
4418 4453          zone->zone_nprocs_ctl = INT_MAX;
4419 4454          zone->zone_locked_mem = 0;
4420 4455          zone->zone_locked_mem_ctl = UINT64_MAX;
4421 4456          zone->zone_max_swap = 0;

4422 4457          zone->zone_max_swap_ctl = UINT64_MAX;
4423 4458          zone->zone_max_lofi = 0;
4424 4459          zone->zone_max_lofi_ctl = UINT64_MAX;
4425 4460          zone0.zone_lockedmem_kstat = NULL;
4426 4461          zone0.zone_swapresv_kstat = NULL;
4427 4462  
4428 4463          /*
4429 4464           * Zsched initializes the rctls.
4430 4465           */
4431 4466          zone->zone_rctls = NULL;
4432 4467  
4433 4468          if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4434 4469                  zone_free(zone);
4435 4470                  return (zone_create_error(error, 0, extended_error));
4436 4471          }
4437 4472  
4438 4473          if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4439 4474                  zone_free(zone);
4440 4475                  return (set_errno(error));
4441 4476          }
4442 4477  
4443 4478          /*
4444 4479           * Read in the trusted system parameters:
4445 4480           * match flag and sensitivity label.
4446 4481           */
4447 4482          zone->zone_match = match;
4448 4483          if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4449 4484                  /* Fail if requested to set doi to anything but system's doi */
4450 4485                  if (doi != 0 && doi != default_doi) {
4451 4486                          zone_free(zone);
4452 4487                          return (set_errno(EINVAL));
4453 4488                  }
4454 4489                  /* Always apply system's doi to the zone */
4455 4490                  error = zone_set_label(zone, label, default_doi);
4456 4491                  if (error != 0) {
4457 4492                          zone_free(zone);
4458 4493                          return (set_errno(error));
4459 4494                  }
4460 4495                  insert_label_hash = B_TRUE;
4461 4496          } else {
4462 4497                  /* all zones get an admin_low label if system is not labeled */
4463 4498                  zone->zone_slabel = l_admin_low;
4464 4499                  label_hold(l_admin_low);
4465 4500                  insert_label_hash = B_FALSE;
4466 4501          }
4467 4502  
4468 4503          /*
4469 4504           * Stop all lwps since that's what normally happens as part of fork().
4470 4505           * This needs to happen before we grab any locks to avoid deadlock
4471 4506           * (another lwp in the process could be waiting for the held lock).
4472 4507           */
4473 4508          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4474 4509                  zone_free(zone);
4475 4510                  nvlist_free(rctls);
4476 4511                  return (zone_create_error(error, 0, extended_error));
4477 4512          }
4478 4513  
4479 4514          if (block_mounts(zone) == 0) {
4480 4515                  mutex_enter(&pp->p_lock);
4481 4516                  if (curthread != pp->p_agenttp)
4482 4517                          continuelwps(pp);
4483 4518                  mutex_exit(&pp->p_lock);
4484 4519                  zone_free(zone);
4485 4520                  nvlist_free(rctls);
4486 4521                  return (zone_create_error(error, 0, extended_error));
4487 4522          }
4488 4523  
4489 4524          /*
4490 4525           * Set up credential for kernel access.  After this, any errors
4491 4526           * should go through the dance in errout rather than calling
4492 4527           * zone_free directly.
4493 4528           */
4494 4529          zone->zone_kcred = crdup(kcred);
4495 4530          crsetzone(zone->zone_kcred, zone);
4496 4531          priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4497 4532          priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4498 4533          priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4499 4534          priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4500 4535  
4501 4536          mutex_enter(&zonehash_lock);
4502 4537          /*
4503 4538           * Make sure zone doesn't already exist.
4504 4539           *
4505 4540           * If the system and zone are labeled,
4506 4541           * make sure no other zone exists that has the same label.
4507 4542           */
4508 4543          if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4509 4544              (insert_label_hash &&
4510 4545              (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4511 4546                  zone_status_t status;
4512 4547  
4513 4548                  status = zone_status_get(ztmp);
4514 4549                  if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4515 4550                          error = EEXIST;
4516 4551                  else
4517 4552                          error = EBUSY;
4518 4553  
4519 4554                  if (insert_label_hash)
4520 4555                          error2 = ZE_LABELINUSE;
4521 4556  
4522 4557                  goto errout;
4523 4558          }
4524 4559  
4525 4560          /*
4526 4561           * Don't allow zone creations which would cause one zone's rootpath to
4527 4562           * be accessible from that of another (non-global) zone.
4528 4563           */
4529 4564          if (zone_is_nested(zone->zone_rootpath)) {
4530 4565                  error = EBUSY;
4531 4566                  goto errout;
4532 4567          }
4533 4568  
4534 4569          ASSERT(zonecount != 0);         /* check for leaks */
4535 4570          if (zonecount + 1 > maxzones) {
4536 4571                  error = ENOMEM;
4537 4572                  goto errout;
4538 4573          }
4539 4574  
4540 4575          if (zone_mount_count(zone->zone_rootpath) != 0) {
4541 4576                  error = EBUSY;
4542 4577                  error2 = ZE_AREMOUNTS;
4543 4578                  goto errout;
4544 4579          }
4545 4580  
4546 4581          /*
4547 4582           * Zone is still incomplete, but we need to drop all locks while
4548 4583           * zsched() initializes this zone's kernel process.  We
4549 4584           * optimistically add the zone to the hashtable and associated
4550 4585           * lists so a parallel zone_create() doesn't try to create the
4551 4586           * same zone.
4552 4587           */
4553 4588          zonecount++;
4554 4589          (void) mod_hash_insert(zonehashbyid,
4555 4590              (mod_hash_key_t)(uintptr_t)zone->zone_id,
4556 4591              (mod_hash_val_t)(uintptr_t)zone);
4557 4592          str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4558 4593          (void) strcpy(str, zone->zone_name);
4559 4594          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4560 4595              (mod_hash_val_t)(uintptr_t)zone);
4561 4596          if (insert_label_hash) {
4562 4597                  (void) mod_hash_insert(zonehashbylabel,
4563 4598                      (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4564 4599                  zone->zone_flags |= ZF_HASHED_LABEL;
4565 4600          }
4566 4601  
4567 4602          /*
4568 4603           * Insert into active list.  At this point there are no 'hold's
4569 4604           * on the zone, but everyone else knows not to use it, so we can
4570 4605           * continue to use it.  zsched() will do a zone_hold() if the
4571 4606           * newproc() is successful.
4572 4607           */
4573 4608          list_insert_tail(&zone_active, zone);
4574 4609          mutex_exit(&zonehash_lock);
4575 4610  
4576 4611          zarg.zone = zone;
4577 4612          zarg.nvlist = rctls;
4578 4613          /*
4579 4614           * The process, task, and project rctls are probably wrong;
4580 4615           * we need an interface to get the default values of all rctls,
4581 4616           * and initialize zsched appropriately.  I'm not sure that that
4582 4617           * makes much of a difference, though.
4583 4618           */
4584 4619          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4585 4620          if (error != 0) {
4586 4621                  /*
4587 4622                   * We need to undo all globally visible state.
4588 4623                   */
4589 4624                  mutex_enter(&zonehash_lock);
4590 4625                  list_remove(&zone_active, zone);
4591 4626                  if (zone->zone_flags & ZF_HASHED_LABEL) {
4592 4627                          ASSERT(zone->zone_slabel != NULL);
4593 4628                          (void) mod_hash_destroy(zonehashbylabel,
4594 4629                              (mod_hash_key_t)zone->zone_slabel);
4595 4630                  }
4596 4631                  (void) mod_hash_destroy(zonehashbyname,
4597 4632                      (mod_hash_key_t)(uintptr_t)zone->zone_name);
4598 4633                  (void) mod_hash_destroy(zonehashbyid,
4599 4634                      (mod_hash_key_t)(uintptr_t)zone->zone_id);
4600 4635                  ASSERT(zonecount > 1);
4601 4636                  zonecount--;
4602 4637                  goto errout;
4603 4638          }
4604 4639  
4605 4640          /*
4606 4641           * Zone creation can't fail from now on.
4607 4642           */
4608 4643  
4609 4644          /*
4610 4645           * Create zone kstats
4611 4646           */
4612 4647          zone_kstat_create(zone);
4613 4648  
4614 4649          /*
4615 4650           * Let the other lwps continue.
4616 4651           */
4617 4652          mutex_enter(&pp->p_lock);
4618 4653          if (curthread != pp->p_agenttp)
4619 4654                  continuelwps(pp);
4620 4655          mutex_exit(&pp->p_lock);
4621 4656  
4622 4657          /*
4623 4658           * Wait for zsched to finish initializing the zone.
4624 4659           */
4625 4660          zone_status_wait(zone, ZONE_IS_READY);
4626 4661          /*
4627 4662           * The zone is fully visible, so we can let mounts progress.
4628 4663           */
4629 4664          resume_mounts(zone);
4630 4665          nvlist_free(rctls);
4631 4666  
4632 4667          return (zoneid);
4633 4668  
4634 4669  errout:
4635 4670          mutex_exit(&zonehash_lock);
4636 4671          /*
4637 4672           * Let the other lwps continue.
4638 4673           */
4639 4674          mutex_enter(&pp->p_lock);
4640 4675          if (curthread != pp->p_agenttp)
4641 4676                  continuelwps(pp);
4642 4677          mutex_exit(&pp->p_lock);
4643 4678  
4644 4679          resume_mounts(zone);
4645 4680          nvlist_free(rctls);
4646 4681          /*
4647 4682           * There is currently one reference to the zone, a cred_ref from
4648 4683           * zone_kcred.  To free the zone, we call crfree, which will call
4649 4684           * zone_cred_rele, which will call zone_free.
4650 4685           */
4651 4686          ASSERT(zone->zone_cred_ref == 1);
4652 4687          ASSERT(zone->zone_kcred->cr_ref == 1);
4653 4688          ASSERT(zone->zone_ref == 0);
4654 4689          zkcr = zone->zone_kcred;
4655 4690          zone->zone_kcred = NULL;
4656 4691          crfree(zkcr);                           /* triggers call to zone_free */
4657 4692          return (zone_create_error(error, error2, extended_error));
4658 4693  }
4659 4694  
4660 4695  /*
4661 4696   * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4662 4697   * the heavy lifting.  initname is the path to the program to launch
4663 4698   * at the "top" of the zone; if this is NULL, we use the system default,
4664 4699   * which is stored at zone_default_initname.
4665 4700   */
4666 4701  static int
4667 4702  zone_boot(zoneid_t zoneid)
4668 4703  {
4669 4704          int err;
4670 4705          zone_t *zone;
4671 4706  
4672 4707          if (secpolicy_zone_config(CRED()) != 0)
4673 4708                  return (set_errno(EPERM));
4674 4709          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4675 4710                  return (set_errno(EINVAL));
4676 4711  
4677 4712          mutex_enter(&zonehash_lock);
4678 4713          /*
4679 4714           * Look for zone under hash lock to prevent races with calls to
4680 4715           * zone_shutdown, zone_destroy, etc.
4681 4716           */
4682 4717          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4683 4718                  mutex_exit(&zonehash_lock);
4684 4719                  return (set_errno(EINVAL));
4685 4720          }
4686 4721  
4687 4722          mutex_enter(&zone_status_lock);
4688 4723          if (zone_status_get(zone) != ZONE_IS_READY) {
4689 4724                  mutex_exit(&zone_status_lock);
4690 4725                  mutex_exit(&zonehash_lock);
4691 4726                  return (set_errno(EINVAL));
4692 4727          }
4693 4728          zone_status_set(zone, ZONE_IS_BOOTING);
4694 4729          mutex_exit(&zone_status_lock);
4695 4730  
4696 4731          zone_hold(zone);        /* so we can use the zone_t later */
4697 4732          mutex_exit(&zonehash_lock);
4698 4733  
4699 4734          if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4700 4735                  zone_rele(zone);
4701 4736                  return (set_errno(EINTR));
4702 4737          }
4703 4738  
4704 4739          /*
4705 4740           * Boot (starting init) might have failed, in which case the zone
4706 4741           * will go to the SHUTTING_DOWN state; an appropriate errno will
4707 4742           * be placed in zone->zone_boot_err, and so we return that.
4708 4743           */
4709 4744          err = zone->zone_boot_err;
4710 4745          zone_rele(zone);
4711 4746          return (err ? set_errno(err) : 0);
4712 4747  }
4713 4748  
4714 4749  /*
4715 4750   * Kills all user processes in the zone, waiting for them all to exit
4716 4751   * before returning.
4717 4752   */
4718 4753  static int
4719 4754  zone_empty(zone_t *zone)
4720 4755  {
4721 4756          int waitstatus;
4722 4757  
4723 4758          /*
4724 4759           * We need to drop zonehash_lock before killing all
4725 4760           * processes, otherwise we'll deadlock with zone_find_*
4726 4761           * which can be called from the exit path.
4727 4762           */
4728 4763          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4729 4764          while ((waitstatus = zone_status_timedwait_sig(zone,
4730 4765              ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4731 4766                  killall(zone->zone_id);
4732 4767          }
4733 4768          /*
4734 4769           * return EINTR if we were signaled
4735 4770           */
4736 4771          if (waitstatus == 0)
4737 4772                  return (EINTR);
4738 4773          return (0);
4739 4774  }
4740 4775  
4741 4776  /*
4742 4777   * This function implements the policy for zone visibility.
4743 4778   *
4744 4779   * In standard Solaris, a non-global zone can only see itself.
4745 4780   *
4746 4781   * In Trusted Extensions, a labeled zone can lookup any zone whose label
4747 4782   * it dominates. For this test, the label of the global zone is treated as
4748 4783   * admin_high so it is special-cased instead of being checked for dominance.
4749 4784   *
4750 4785   * Returns true if zone attributes are viewable, false otherwise.
4751 4786   */
4752 4787  static boolean_t
4753 4788  zone_list_access(zone_t *zone)
4754 4789  {
4755 4790  
4756 4791          if (curproc->p_zone == global_zone ||
4757 4792              curproc->p_zone == zone) {
4758 4793                  return (B_TRUE);
4759 4794          } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4760 4795                  bslabel_t *curproc_label;
4761 4796                  bslabel_t *zone_label;
4762 4797  
4763 4798                  curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4764 4799                  zone_label = label2bslabel(zone->zone_slabel);
4765 4800  
4766 4801                  if (zone->zone_id != GLOBAL_ZONEID &&
4767 4802                      bldominates(curproc_label, zone_label)) {
4768 4803                          return (B_TRUE);
4769 4804                  } else {
4770 4805                          return (B_FALSE);
4771 4806                  }
4772 4807          } else {
4773 4808                  return (B_FALSE);
4774 4809          }
4775 4810  }
4776 4811  
4777 4812  /*
4778 4813   * Systemcall to start the zone's halt sequence.  By the time this
4779 4814   * function successfully returns, all user processes and kernel threads
4780 4815   * executing in it will have exited, ZSD shutdown callbacks executed,
4781 4816   * and the zone status set to ZONE_IS_DOWN.
4782 4817   *
4783 4818   * It is possible that the call will interrupt itself if the caller is the
4784 4819   * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4785 4820   */
4786 4821  static int
4787 4822  zone_shutdown(zoneid_t zoneid)
4788 4823  {
4789 4824          int error;
4790 4825          zone_t *zone;
4791 4826          zone_status_t status;
4792 4827  
4793 4828          if (secpolicy_zone_config(CRED()) != 0)
4794 4829                  return (set_errno(EPERM));
4795 4830          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4796 4831                  return (set_errno(EINVAL));
4797 4832  
4798 4833          mutex_enter(&zonehash_lock);
4799 4834          /*
4800 4835           * Look for zone under hash lock to prevent races with other
4801 4836           * calls to zone_shutdown and zone_destroy.
4802 4837           */
4803 4838          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4804 4839                  mutex_exit(&zonehash_lock);
4805 4840                  return (set_errno(EINVAL));
4806 4841          }
4807 4842  
4808 4843          /*
4809 4844           * We have to drop zonehash_lock before calling block_mounts.
4810 4845           * Hold the zone so we can continue to use the zone_t.
4811 4846           */
4812 4847          zone_hold(zone);
4813 4848          mutex_exit(&zonehash_lock);
4814 4849  
4815 4850          /*
4816 4851           * Block mounts so that VFS_MOUNT() can get an accurate view of
4817 4852           * the zone's status with regards to ZONE_IS_SHUTTING down.
4818 4853           *
4819 4854           * e.g. NFS can fail the mount if it determines that the zone
4820 4855           * has already begun the shutdown sequence.
4821 4856           *
4822 4857           */
4823 4858          if (block_mounts(zone) == 0) {
4824 4859                  zone_rele(zone);
4825 4860                  return (set_errno(EINTR));
4826 4861          }
4827 4862  
4828 4863          mutex_enter(&zonehash_lock);
4829 4864          mutex_enter(&zone_status_lock);
4830 4865          status = zone_status_get(zone);
4831 4866          /*
4832 4867           * Fail if the zone isn't fully initialized yet.
4833 4868           */
4834 4869          if (status < ZONE_IS_READY) {
4835 4870                  mutex_exit(&zone_status_lock);
4836 4871                  mutex_exit(&zonehash_lock);
4837 4872                  resume_mounts(zone);
4838 4873                  zone_rele(zone);
4839 4874                  return (set_errno(EINVAL));
4840 4875          }
4841 4876          /*
4842 4877           * If conditions required for zone_shutdown() to return have been met,
4843 4878           * return success.
4844 4879           */
4845 4880          if (status >= ZONE_IS_DOWN) {
4846 4881                  mutex_exit(&zone_status_lock);
4847 4882                  mutex_exit(&zonehash_lock);
4848 4883                  resume_mounts(zone);
4849 4884                  zone_rele(zone);
4850 4885                  return (0);
4851 4886          }
4852 4887          /*
4853 4888           * If zone_shutdown() hasn't been called before, go through the motions.
4854 4889           * If it has, there's nothing to do but wait for the kernel threads to
4855 4890           * drain.
4856 4891           */
4857 4892          if (status < ZONE_IS_EMPTY) {
4858 4893                  uint_t ntasks;
4859 4894  
4860 4895                  mutex_enter(&zone->zone_lock);
4861 4896                  if ((ntasks = zone->zone_ntasks) != 1) {
4862 4897                          /*
4863 4898                           * There's still stuff running.
4864 4899                           */
4865 4900                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4866 4901                  }
4867 4902                  mutex_exit(&zone->zone_lock);
4868 4903                  if (ntasks == 1) {
4869 4904                          /*
4870 4905                           * The only way to create another task is through
4871 4906                           * zone_enter(), which will block until we drop
4872 4907                           * zonehash_lock.  The zone is empty.
4873 4908                           */
4874 4909                          if (zone->zone_kthreads == NULL) {
4875 4910                                  /*
4876 4911                                   * Skip ahead to ZONE_IS_DOWN
4877 4912                                   */
4878 4913                                  zone_status_set(zone, ZONE_IS_DOWN);
4879 4914                          } else {
4880 4915                                  zone_status_set(zone, ZONE_IS_EMPTY);
4881 4916                          }
4882 4917                  }
4883 4918          }
4884 4919          mutex_exit(&zone_status_lock);
4885 4920          mutex_exit(&zonehash_lock);
4886 4921          resume_mounts(zone);
4887 4922  
4888 4923          if (error = zone_empty(zone)) {
4889 4924                  zone_rele(zone);
4890 4925                  return (set_errno(error));
4891 4926          }
4892 4927          /*
4893 4928           * After the zone status goes to ZONE_IS_DOWN this zone will no
4894 4929           * longer be notified of changes to the pools configuration, so
4895 4930           * in order to not end up with a stale pool pointer, we point
4896 4931           * ourselves at the default pool and remove all resource
4897 4932           * visibility.  This is especially important as the zone_t may
4898 4933           * languish on the deathrow for a very long time waiting for
4899 4934           * cred's to drain out.
4900 4935           *
4901 4936           * This rebinding of the zone can happen multiple times
4902 4937           * (presumably due to interrupted or parallel systemcalls)
4903 4938           * without any adverse effects.
4904 4939           */
4905 4940          if (pool_lock_intr() != 0) {
4906 4941                  zone_rele(zone);
4907 4942                  return (set_errno(EINTR));
4908 4943          }
4909 4944          if (pool_state == POOL_ENABLED) {
4910 4945                  mutex_enter(&cpu_lock);
4911 4946                  zone_pool_set(zone, pool_default);
4912 4947                  /*
4913 4948                   * The zone no longer needs to be able to see any cpus.
4914 4949                   */
4915 4950                  zone_pset_set(zone, ZONE_PS_INVAL);
4916 4951                  mutex_exit(&cpu_lock);
4917 4952          }
4918 4953          pool_unlock();
4919 4954  
4920 4955          /*
4921 4956           * ZSD shutdown callbacks can be executed multiple times, hence
4922 4957           * it is safe to not be holding any locks across this call.
4923 4958           */
4924 4959          zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4925 4960  
4926 4961          mutex_enter(&zone_status_lock);
4927 4962          if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4928 4963                  zone_status_set(zone, ZONE_IS_DOWN);
4929 4964          mutex_exit(&zone_status_lock);
4930 4965  
4931 4966          /*
4932 4967           * Wait for kernel threads to drain.
4933 4968           */
4934 4969          if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4935 4970                  zone_rele(zone);
4936 4971                  return (set_errno(EINTR));
4937 4972          }
4938 4973  
4939 4974          /*
4940 4975           * Zone can be become down/destroyable even if the above wait
4941 4976           * returns EINTR, so any code added here may never execute.
4942 4977           * (i.e. don't add code here)
4943 4978           */
4944 4979  
4945 4980          zone_rele(zone);
4946 4981          return (0);
4947 4982  }
4948 4983  
4949 4984  /*
4950 4985   * Log the specified zone's reference counts.  The caller should not be
4951 4986   * holding the zone's zone_lock.
            *
            * A single strlog() console note is emitted containing the zone's name
            * and ID, its general-purpose reference count (zone_ref) and its
            * credential reference count (zone_cred_ref).  If any entry of
            * zone_subsys_ref[] is nonzero, a bracketed "name: count" list of those
            * entries is appended to the message.
            *
            * Side effect: ZF_REFCOUNTS_LOGGED is set in zone->zone_flags (under
            * zone_lock) so that callers can tell the counts were already logged.
4952 4987   */
4953 4988  static void
4954 4989  zone_log_refcounts(zone_t *zone)
4955 4990  {
4956 4991          char *buffer;
4957 4992          char *buffer_position;
4958 4993          uint32_t buffer_size;
4959 4994          uint32_t index;
4960 4995          uint_t ref;
4961 4996          uint_t cred_ref;
4962 4997  
4963 4998          /*
4964 4999           * Construct a string representing the subsystem-specific reference
4965 5000           * counts.  The counts are printed in ascending order by index into the
4966 5001           * zone_t::zone_subsys_ref array.  The list will be surrounded by
4967 5002           * square brackets [] and will only contain nonzero reference counts.
4968 5003           *
4969 5004           * The buffer will hold two square bracket characters plus ten digits,
4970 5005           * one colon, one space, one comma, and some characters for a
4971 5006           * subsystem name per subsystem-specific reference count.  (Unsigned 32-
4972 5007           * bit integers have at most ten decimal digits.)  The last
4973 5008           * reference count's comma is replaced by the closing square
4974 5009           * bracket and a NUL character to terminate the string.
4975 5010           *
4976 5011           * NOTE: We have to grab the zone's zone_lock to create a consistent
4977 5012           * snapshot of the zone's reference counters.
4978 5013           *
4979 5014           * First, figure out how much space the string buffer will need.
4980 5015           * The buffer's size is stored in buffer_size.
4981 5016           */
4982 5017          buffer_size = 2;                        /* for the square brackets */
4983 5018          mutex_enter(&zone->zone_lock);
4984 5019          zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
                   /* Snapshot the two scalar counts while the lock is held. */
4985 5020          ref = zone->zone_ref;
4986 5021          cred_ref = zone->zone_cred_ref;
                   /*
                    * 13 == ten digits + colon + space + comma; the closing bracket
                    * and the terminating NUL are covered by the initial 2 because
                    * the final comma is overwritten with ']'.
                    */
4987 5022          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
4988 5023                  if (zone->zone_subsys_ref[index] != 0)
4989 5024                          buffer_size += strlen(zone_ref_subsys_names[index]) +
4990 5025                              13;
4991 5026          if (buffer_size == 2) {
4992 5027                  /*
4993 5028                   * No subsystems had nonzero reference counts.  Don't bother
4994 5029                   * with allocating a buffer; just log the general-purpose and
4995 5030                   * credential reference counts.
4996 5031                   */
4997 5032                  mutex_exit(&zone->zone_lock);
4998 5033                  (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4999 5034                      "Zone '%s' (ID: %d) is shutting down, but %u zone "
5000 5035                      "references and %u credential references are still extant",
5001 5036                      zone->zone_name, zone->zone_id, ref, cred_ref);
5002 5037                  return;
5003 5038          }
5004 5039  
5005 5040          /*
5006 5041           * buffer_size contains the exact number of characters that the
5007 5042           * buffer will need.  Allocate the buffer and fill it with nonzero
5008 5043           * subsystem-specific reference counts.  Surround the results with
5009 5044           * square brackets afterwards.
                    *
                    * NOTE(review): zone_lock is still held across this KM_SLEEP
                    * allocation and the fill loop; it is only dropped after the loop.
                    * That is what keeps the sizing pass above and the fill pass below
                    * looking at the same counter values.
5010 5045           */
5011 5046          buffer = kmem_alloc(buffer_size, KM_SLEEP);
                   /* Leave buffer[0] free for the opening bracket written later. */
5012 5047          buffer_position = &buffer[1];
5013 5048          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5014 5049                  /*
5015 5050                   * NOTE: The DDI's version of sprintf() returns a pointer to
5016 5051                   * the modified buffer rather than the number of bytes written
5017 5052                   * (as in snprintf(3C)).  This is unfortunate and annoying.
5018 5053                   * Therefore, we'll use snprintf() with INT_MAX to get the
5019 5054                   * number of bytes written.  Using INT_MAX is safe because
5020 5055                   * the buffer is perfectly sized for the data: we'll never
5021 5056                   * overrun the buffer.
5022 5057                   */
5023 5058                  if (zone->zone_subsys_ref[index] != 0)
5024 5059                          buffer_position += snprintf(buffer_position, INT_MAX,
5025 5060                              "%s: %u,", zone_ref_subsys_names[index],
5026 5061                              zone->zone_subsys_ref[index]);
5027 5062          }
5028 5063          mutex_exit(&zone->zone_lock);
5029 5064          buffer[0] = '[';
5030 5065          ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5031 5066          ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
                   /* Turn the trailing comma of the last entry into the close bracket. */
5032 5067          buffer_position[-1] = ']';
5033 5068  
5034 5069          /*
5035 5070           * Log the reference counts and free the message buffer.
5036 5071           */
5037 5072          (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5038 5073              "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5039 5074              "%u credential references are still extant %s", zone->zone_name,
5040 5075              zone->zone_id, ref, cred_ref, buffer);
5041 5076          kmem_free(buffer, buffer_size);
5042 5077  }
5043 5078  
5044 5079  /*
5045 5080   * Systemcall entry point to finalize the zone halt process.  The caller
5046 5081   * must have already successfully called zone_shutdown().
5047 5082   *
5048 5083   * Upon successful completion, the zone will have been fully destroyed:
5049 5084   * zsched will have exited, destructor callbacks executed, and the zone
5050 5085   * removed from the list of active zones.
            *
            * Returns 0 on success.  On failure returns set_errno() of:
            *   EPERM  - secpolicy_zone_config() rejected the caller's credentials.
            *   EINVAL - zoneid is outside [MIN_USERZONEID, MAX_ZONEID] or no zone
            *            with that ID was found.
            *   EBUSY  - zone_mount_count() is nonzero for the zone's root path, or
            *            the zone's status is below ZONE_IS_DOWN.
            *   EINTR  - a signal interrupted the wait for outstanding references.
5051 5086   */
5052 5087  static int
5053 5088  zone_destroy(zoneid_t zoneid)
5054 5089  {
5055 5090          uint64_t uniqid;
5056 5091          zone_t *zone;
5057 5092          zone_status_t status;
5058 5093          clock_t wait_time;
5059 5094          boolean_t log_refcounts;
5060 5095  
5061 5096          if (secpolicy_zone_config(CRED()) != 0)
5062 5097                  return (set_errno(EPERM));
5063 5098          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5064 5099                  return (set_errno(EINVAL));
5065 5100  
5066 5101          mutex_enter(&zonehash_lock);
5067 5102          /*
5068 5103           * Look for zone under hash lock to prevent races with other
5069 5104           * calls to zone_destroy.
5070 5105           */
5071 5106          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5072 5107                  mutex_exit(&zonehash_lock);
5073 5108                  return (set_errno(EINVAL));
5074 5109          }
5075 5110  
5076 5111          if (zone_mount_count(zone->zone_rootpath) != 0) {
5077 5112                  mutex_exit(&zonehash_lock);
5078 5113                  return (set_errno(EBUSY));
5079 5114          }
                   /*
                    * zone_status_lock nests inside zonehash_lock here.  A zone that is
                    * exactly ZONE_IS_DOWN is advanced to ZONE_IS_DYING; a zone already
                    * past ZONE_IS_DOWN is left alone and we simply join the wait below.
                    */
5080 5115          mutex_enter(&zone_status_lock);
5081 5116          status = zone_status_get(zone);
5082 5117          if (status < ZONE_IS_DOWN) {
5083 5118                  mutex_exit(&zone_status_lock);
5084 5119                  mutex_exit(&zonehash_lock);
5085 5120                  return (set_errno(EBUSY));
5086 5121          } else if (status == ZONE_IS_DOWN) {
5087 5122                  zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5088 5123          }
5089 5124          mutex_exit(&zone_status_lock);
                   /* Take a hold before dropping zonehash_lock so zone stays valid. */
5090 5125          zone_hold(zone);
5091 5126          mutex_exit(&zonehash_lock);
5092 5127  
5093 5128          /*
5094 5129           * wait for zsched to exit
5095 5130           */
5096 5131          zone_status_wait(zone, ZONE_IS_DEAD);
5097 5132          zone_zsd_callbacks(zone, ZSD_DESTROY);
5098 5133          zone->zone_netstack = NULL;
                   /*
                    * Remember the uniqid before giving up our hold: the loop below
                    * compares it with whatever zone zoneid resolves to and treats a
                    * mismatch the same as "the zone has gone away".
                    */
5099 5134          uniqid = zone->zone_uniqid;
5100 5135          zone_rele(zone);
5101 5136          zone = NULL;    /* potentially free'd */
5102 5137  
5103 5138          log_refcounts = B_FALSE;
5104 5139          wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5105 5140          mutex_enter(&zonehash_lock);
5106 5141          for (; /* ever */; ) {
5107 5142                  boolean_t unref;
5108 5143                  boolean_t refs_have_been_logged;
5109 5144  
5110 5145                  if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5111 5146                      zone->zone_uniqid != uniqid) {
5112 5147                          /*
5113 5148                           * The zone has gone away.  Necessary conditions
5114 5149                           * are met, so we return success.
5115 5150                           */
5116 5151                          mutex_exit(&zonehash_lock);
5117 5152                          return (0);
5118 5153                  }
                           /* Sample both values under zone_lock, then act on the copies. */
5119 5154                  mutex_enter(&zone->zone_lock);
5120 5155                  unref = ZONE_IS_UNREF(zone);
5121 5156                  refs_have_been_logged = (zone->zone_flags &
5122 5157                      ZF_REFCOUNTS_LOGGED);
5123 5158                  mutex_exit(&zone->zone_lock);
5124 5159                  if (unref) {
5125 5160                          /*
5126 5161                           * There is only one reference to the zone -- that
5127 5162                           * added when the zone was added to the hashtables --
5128 5163                           * and things will remain this way until we drop
5129 5164                           * zonehash_lock... we can go ahead and cleanup the
5130 5165                           * zone.
5131 5166                           */
5132 5167                          break;
5133 5168                  }
5134 5169  
5135 5170                  /*
5136 5171                   * Wait for zone_rele_common() or zone_cred_rele() to signal
5137 5172                   * zone_destroy_cv.  zone_destroy_cv is signaled only when
5138 5173                   * some zone's general-purpose reference count reaches one.
5139 5174                   * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5140 5175                   * on zone_destroy_cv, then log the zone's reference counts and
5141 5176                   * continue to wait for zone_rele() and zone_cred_rele().
5142 5177                   */
5143 5178                  if (!refs_have_been_logged) {
5144 5179                          if (!log_refcounts) {
5145 5180                                  /*
5146 5181                                   * This thread hasn't timed out waiting on
5147 5182                                   * zone_destroy_cv yet.  Wait wait_time clock
5148 5183                                   * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5149 5184                                   * seconds) for the zone's references to clear.
5150 5185                                   */
5151 5186                                  ASSERT(wait_time > 0);
                                           /*
                                            * wait_time is overwritten with the return
                                            * value: > 0 woken by cv, 0 signaled, and
                                            * anything else is handled as a timeout.
                                            */
5152 5187                                  wait_time = cv_reltimedwait_sig(
5153 5188                                      &zone_destroy_cv, &zonehash_lock, wait_time,
5154 5189                                      TR_SEC);
5155 5190                                  if (wait_time > 0) {
5156 5191                                          /*
5157 5192                                           * A thread in zone_rele() or
5158 5193                                           * zone_cred_rele() signaled
5159 5194                                           * zone_destroy_cv before this thread's
5160 5195                                           * wait timed out.  The zone might have
5161 5196                                           * only one reference left; find out!
5162 5197                                           */
5163 5198                                          continue;
5164 5199                                  } else if (wait_time == 0) {
5165 5200                                          /* The thread's process was signaled. */
5166 5201                                          mutex_exit(&zonehash_lock);
5167 5202                                          return (set_errno(EINTR));
5168 5203                                  }
5169 5204  
5170 5205                                  /*
5171 5206                                   * The thread timed out while waiting on
5172 5207                                   * zone_destroy_cv.  Even though the thread
5173 5208                                   * timed out, it has to check whether another
5174 5209                                   * thread woke up from zone_destroy_cv and
5175 5210                                   * destroyed the zone.
5176 5211                                   *
5177 5212                                   * If the zone still exists and has more than
5178 5213                                   * one unreleased general-purpose reference,
5179 5214                                   * then log the zone's reference counts.
5180 5215                                   */
5181 5216                                  log_refcounts = B_TRUE;
5182 5217                                  continue;
5183 5218                          }
5184 5219  
5185 5220                          /*
5186 5221                           * The thread already timed out on zone_destroy_cv while
5187 5222                           * waiting for subsystems to release the zone's last
5188 5223                           * general-purpose references.  Log the zone's reference
5189 5224                           * counts and wait indefinitely on zone_destroy_cv.
                                    *
                                    * zone_log_refcounts() sets ZF_REFCOUNTS_LOGGED, so
                                    * refs_have_been_logged is true on later iterations
                                    * and this branch is not taken again for this zone.
5190 5225                           */
5191 5226                          zone_log_refcounts(zone);
5192 5227                  }
5193 5228                  if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5194 5229                          /* The thread's process was signaled. */
5195 5230                          mutex_exit(&zonehash_lock);
5196 5231                          return (set_errno(EINTR));
5197 5232                  }
5198 5233          }
5199 5234  
                   /*
                    * The loop is only left via "break", i.e. with zonehash_lock still
                    * held and zone pointing at the unreferenced zone found above.
                    */
5200 5235          /*
5201 5236           * Remove CPU cap for this zone now since we're not going to
5202 5237           * fail below this point.
5203 5238           */
5204 5239          cpucaps_zone_remove(zone);
5205 5240  
5206 5241          /* Get rid of the zone's kstats */
5207 5242          zone_kstat_delete(zone);
5208 5243  
5209 5244          /* remove the pfexecd doors */
5210 5245          if (zone->zone_pfexecd != NULL) {
5211 5246                  klpd_freelist(&zone->zone_pfexecd);
5212 5247                  zone->zone_pfexecd = NULL;
5213 5248          }
5214 5249  
5215 5250          /* free brand specific data */
5216 5251          if (ZONE_IS_BRANDED(zone))
5217 5252                  ZBROP(zone)->b_free_brand_data(zone);
5218 5253  
5219 5254          /* Say goodbye to brand framework. */
5220 5255          brand_unregister_zone(zone->zone_brand);
5221 5256  
5222 5257          /*
5223 5258           * It is now safe to let the zone be recreated; remove it from the
5224 5259           * lists.  The memory will not be freed until the last cred
5225 5260           * reference goes away.
5226 5261           */
5227 5262          ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5228 5263          zonecount--;
5229 5264          /* remove from active list and hash tables */
5230 5265          list_remove(&zone_active, zone);
5231 5266          (void) mod_hash_destroy(zonehashbyname,
5232 5267              (mod_hash_key_t)zone->zone_name);
5233 5268          (void) mod_hash_destroy(zonehashbyid,
5234 5269              (mod_hash_key_t)(uintptr_t)zone->zone_id);
                   /* The label hash entry exists only if ZF_HASHED_LABEL was set. */
5235 5270          if (zone->zone_flags & ZF_HASHED_LABEL)
5236 5271                  (void) mod_hash_destroy(zonehashbylabel,
5237 5272                      (mod_hash_key_t)zone->zone_slabel);
5238 5273          mutex_exit(&zonehash_lock);
5239 5274  
5240 5275          /*
5241 5276           * Release the root vnode; we're not using it anymore.  Nor should any
5242 5277           * other thread that might access it exist.
5243 5278           */
5244 5279          if (zone->zone_rootvp != NULL) {
5245 5280                  VN_RELE(zone->zone_rootvp);
5246 5281                  zone->zone_rootvp = NULL;
5247 5282          }
5248 5283  
5249 5284          /* add to deathrow list */
5250 5285          mutex_enter(&zone_deathrow_lock);
5251 5286          list_insert_tail(&zone_deathrow, zone);
5252 5287          mutex_exit(&zone_deathrow_lock);
5253 5288  
5254 5289          /*
5255 5290           * Drop last reference (which was added by zsched()), this will
5256 5291           * free the zone unless there are outstanding cred references.
5257 5292           */
5258 5293          zone_rele(zone);
5259 5294          return (0);
5260 5295  }
5261 5296  
5262 5297  /*
5263 5298   * Systemcall entry point for zone_getattr(2).
            *
            * Copies the value of attribute `attr' of zone `zoneid' out to the user
            * buffer `buf', writing at most `bufsize' bytes.  For the string and
            * fixed-size attributes a NULL buf skips the copy, so a caller can probe
            * for the size it needs.
            *
            * Returns the attribute's size in bytes on success; this can be larger
            * than the number of bytes actually copied when bufsize was too small.
            * On failure returns set_errno() of EINVAL (unknown zone or attribute,
            * zone not yet initialized, access denied, bad buffer size), EFAULT (bad
            * user address), EINTR (interrupted pool lock), ESRCH (no init pid yet),
            * or whatever error a brand's b_getattr hook reports.
5264 5299   */
5265 5300  static ssize_t
5266 5301  zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5267 5302  {
5268 5303          size_t size;
5269 5304          int error = 0, err;
5270 5305          zone_t *zone;
5271 5306          char *zonepath;
5272 5307          char *outstr;
5273 5308          zone_status_t zone_status;
5274 5309          pid_t initpid;
5275 5310          boolean_t global = (curzone == global_zone);
5276 5311          boolean_t inzone = (curzone->zone_id == zoneid);
5277 5312          ushort_t flags;
5278 5313          zone_net_data_t *zbuf;
5279 5314  
5280 5315          mutex_enter(&zonehash_lock);
5281 5316          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5282 5317                  mutex_exit(&zonehash_lock);
5283 5318                  return (set_errno(EINVAL));
5284 5319          }
5285 5320          zone_status = zone_status_get(zone);
5286 5321          if (zone_status < ZONE_IS_INITIALIZED) {
5287 5322                  mutex_exit(&zonehash_lock);
5288 5323                  return (set_errno(EINVAL));
5289 5324          }
                   /* Hold the zone so it stays valid once zonehash_lock is dropped. */
5290 5325          zone_hold(zone);
5291 5326          mutex_exit(&zonehash_lock);
5292 5327  
5293 5328          /*
5294 5329           * If not in the global zone, don't show information about other zones,
5295 5330           * unless the system is labeled and the local zone's label dominates
5296 5331           * the other zone.
5297 5332           */
5298 5333          if (!zone_list_access(zone)) {
5299 5334                  zone_rele(zone);
5300 5335                  return (set_errno(EINVAL));
5301 5336          }
5302 5337  
5303 5338          switch (attr) {
5304 5339          case ZONE_ATTR_ROOT:
5305 5340                  if (global) {
5306 5341                          /*
5307 5342                           * Copy the path to trim the trailing "/" (except for
5308 5343                           * the global zone).
5309 5344                           */
5310 5345                          if (zone != global_zone)
5311 5346                                  size = zone->zone_rootpathlen - 1;
5312 5347                          else
5313 5348                                  size = zone->zone_rootpathlen;
5314 5349                          zonepath = kmem_alloc(size, KM_SLEEP);
5315 5350                          bcopy(zone->zone_rootpath, zonepath, size);
5316 5351                          zonepath[size - 1] = '\0';
5317 5352                  } else {
5318 5353                          if (inzone || !is_system_labeled()) {
5319 5354                                  /*
5320 5355                                   * Caller is not in the global zone.
5321 5356                                   * if the query is on the current zone
5322 5357                                   * or the system is not labeled,
5323 5358                                   * just return faked-up path for current zone.
5324 5359                                   */
5325 5360                                  zonepath = "/";
5326 5361                                  size = 2;
5327 5362                          } else {
5328 5363                                  /*
5329 5364                                   * Return related path for current zone.
5330 5365                                   */
5331 5366                                  int prefix_len = strlen(zone_prefix);
5332 5367                                  int zname_len = strlen(zone->zone_name);
5333 5368  
5334 5369                                  size = prefix_len + zname_len + 1;
5335 5370                                  zonepath = kmem_alloc(size, KM_SLEEP);
5336 5371                                  bcopy(zone_prefix, zonepath, prefix_len);
5337 5372                                  bcopy(zone->zone_name, zonepath +
5338 5373                                      prefix_len, zname_len);
5339 5374                                  zonepath[size - 1] = '\0';
5340 5375                          }
5341 5376                  }
5342 5377                  if (bufsize > size)
5343 5378                          bufsize = size;
5344 5379                  if (buf != NULL) {
5345 5380                          err = copyoutstr(zonepath, buf, bufsize, NULL);
5346 5381                          if (err != 0 && err != ENAMETOOLONG)
5347 5382                                  error = EFAULT;
5348 5383                  }
                           /*
                            * Free only the heap copies; this condition is the complement
                            * of the branch above that used the "/" string literal.
                            */
5349 5384                  if (global || (is_system_labeled() && !inzone))
5350 5385                          kmem_free(zonepath, size);
5351 5386                  break;
5352 5387  
5353 5388          case ZONE_ATTR_NAME:
5354 5389                  size = strlen(zone->zone_name) + 1;
5355 5390                  if (bufsize > size)
5356 5391                          bufsize = size;
5357 5392                  if (buf != NULL) {
5358 5393                          err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5359 5394                          if (err != 0 && err != ENAMETOOLONG)
5360 5395                                  error = EFAULT;
5361 5396                  }
5362 5397                  break;
5363 5398  
5364 5399          case ZONE_ATTR_STATUS:
5365 5400                  /*
5366 5401                   * Since we're not holding zonehash_lock, the zone status
5367 5402                   * may be anything; leave it up to userland to sort it out.
5368 5403                   */
5369 5404                  size = sizeof (zone_status);
5370 5405                  if (bufsize > size)
5371 5406                          bufsize = size;
5372 5407                  zone_status = zone_status_get(zone);
5373 5408                  if (buf != NULL &&
5374 5409                      copyout(&zone_status, buf, bufsize) != 0)
5375 5410                          error = EFAULT;
5376 5411                  break;
5377 5412          case ZONE_ATTR_FLAGS:
5378 5413                  size = sizeof (zone->zone_flags);
5379 5414                  if (bufsize > size)
5380 5415                          bufsize = size;
5381 5416                  flags = zone->zone_flags;
5382 5417                  if (buf != NULL &&
5383 5418                      copyout(&flags, buf, bufsize) != 0)
5384 5419                          error = EFAULT;
5385 5420                  break;
5386 5421          case ZONE_ATTR_PRIVSET:
5387 5422                  size = sizeof (priv_set_t);
5388 5423                  if (bufsize > size)
5389 5424                          bufsize = size;
5390 5425                  if (buf != NULL &&
5391 5426                      copyout(zone->zone_privset, buf, bufsize) != 0)
5392 5427                          error = EFAULT;
5393 5428                  break;
5394 5429          case ZONE_ATTR_UNIQID:
5395 5430                  size = sizeof (zone->zone_uniqid);
5396 5431                  if (bufsize > size)
5397 5432                          bufsize = size;
5398 5433                  if (buf != NULL &&
5399 5434                      copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5400 5435                          error = EFAULT;
5401 5436                  break;
5402 5437          case ZONE_ATTR_POOLID:
5403 5438                  {
5404 5439                          pool_t *pool;
5405 5440                          poolid_t poolid;
5406 5441  
5407 5442                          if (pool_lock_intr() != 0) {
5408 5443                                  error = EINTR;
5409 5444                                  break;
5410 5445                          }
5411 5446                          pool = zone_pool_get(zone);
5412 5447                          poolid = pool->pool_id;
5413 5448                          pool_unlock();
5414 5449                          size = sizeof (poolid);
5415 5450                          if (bufsize > size)
5416 5451                                  bufsize = size;
                                   /*
                                    * Copy out the clamped bufsize, not size, so a
                                    * caller that passed a short buffer is not
                                    * written past its end.
                                    */
5417 5452                          if (buf != NULL && copyout(&poolid, buf, bufsize) != 0)
5418 5453                                  error = EFAULT;
5419 5454                  }
5420 5455                  break;
5421 5456          case ZONE_ATTR_SLBL:
5422 5457                  size = sizeof (bslabel_t);
5423 5458                  if (bufsize > size)
5424 5459                          bufsize = size;
5425 5460                  if (zone->zone_slabel == NULL)
5426 5461                          error = EINVAL;
5427 5462                  else if (buf != NULL &&
5428 5463                      copyout(label2bslabel(zone->zone_slabel), buf,
5429 5464                      bufsize) != 0)
5430 5465                          error = EFAULT;
5431 5466                  break;
5432 5467          case ZONE_ATTR_INITPID:
5433 5468                  size = sizeof (initpid);
5434 5469                  if (bufsize > size)
5435 5470                          bufsize = size;
5436 5471                  initpid = zone->zone_proc_initpid;
5437 5472                  if (initpid == -1) {
5438 5473                          error = ESRCH;
5439 5474                          break;
5440 5475                  }
5441 5476                  if (buf != NULL &&
5442 5477                      copyout(&initpid, buf, bufsize) != 0)
5443 5478                          error = EFAULT;
5444 5479                  break;
5445 5480          case ZONE_ATTR_BRAND:
5446 5481                  size = strlen(zone->zone_brand->b_name) + 1;
5447 5482  
5448 5483                  if (bufsize > size)
5449 5484                          bufsize = size;
5450 5485                  if (buf != NULL) {
5451 5486                          err = copyoutstr(zone->zone_brand->b_name, buf,
5452 5487                              bufsize, NULL);
5453 5488                          if (err != 0 && err != ENAMETOOLONG)
5454 5489                                  error = EFAULT;
5455 5490                  }
5456 5491                  break;
5457 5492          case ZONE_ATTR_INITNAME:
5458 5493                  size = strlen(zone->zone_initname) + 1;
5459 5494                  if (bufsize > size)
5460 5495                          bufsize = size;
5461 5496                  if (buf != NULL) {
5462 5497                          err = copyoutstr(zone->zone_initname, buf, bufsize,
5463 5498                              NULL);
5464 5499                          if (err != 0 && err != ENAMETOOLONG)
5465 5500                                  error = EFAULT;
5466 5501                  }
5467 5502                  break;
5468 5503          case ZONE_ATTR_BOOTARGS:
5469 5504                  if (zone->zone_bootargs == NULL)
5470 5505                          outstr = "";
5471 5506                  else
5472 5507                          outstr = zone->zone_bootargs;
5473 5508                  size = strlen(outstr) + 1;
5474 5509                  if (bufsize > size)
5475 5510                          bufsize = size;
5476 5511                  if (buf != NULL) {
5477 5512                          err = copyoutstr(outstr, buf, bufsize, NULL);
5478 5513                          if (err != 0 && err != ENAMETOOLONG)
5479 5514                                  error = EFAULT;
5480 5515                  }
5481 5516                  break;
5482 5517          case ZONE_ATTR_PHYS_MCAP:
5483 5518                  size = sizeof (zone->zone_phys_mcap);
5484 5519                  if (bufsize > size)
5485 5520                          bufsize = size;
5486 5521                  if (buf != NULL &&
5487 5522                      copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5488 5523                          error = EFAULT;
5489 5524                  break;
5490 5525          case ZONE_ATTR_SCHED_CLASS:
                           /* class_lock is held while sclass[] is read and copied out. */
5491 5526                  mutex_enter(&class_lock);
5492 5527  
5493 5528                  if (zone->zone_defaultcid >= loaded_classes)
5494 5529                          outstr = "";
5495 5530                  else
5496 5531                          outstr = sclass[zone->zone_defaultcid].cl_name;
5497 5532                  size = strlen(outstr) + 1;
5498 5533                  if (bufsize > size)
5499 5534                          bufsize = size;
5500 5535                  if (buf != NULL) {
5501 5536                          err = copyoutstr(outstr, buf, bufsize, NULL);
5502 5537                          if (err != 0 && err != ENAMETOOLONG)
5503 5538                                  error = EFAULT;
5504 5539                  }
5505 5540  
5506 5541                  mutex_exit(&class_lock);
5507 5542                  break;
5508 5543          case ZONE_ATTR_HOSTID:
                           /* Unlike the other cases, the buffer size must match exactly. */
5509 5544                  if (zone->zone_hostid != HW_INVALID_HOSTID &&
5510 5545                      bufsize == sizeof (zone->zone_hostid)) {
5511 5546                          size = sizeof (zone->zone_hostid);
5512 5547                          if (buf != NULL && copyout(&zone->zone_hostid, buf,
5513 5548                              bufsize) != 0)
5514 5549                                  error = EFAULT;
5515 5550                  } else {
5516 5551                          error = EINVAL;
5517 5552                  }
5518 5553                  break;
5519 5554          case ZONE_ATTR_FS_ALLOWED:
5520 5555                  if (zone->zone_fs_allowed == NULL)
5521 5556                          outstr = "";
5522 5557                  else
5523 5558                          outstr = zone->zone_fs_allowed;
5524 5559                  size = strlen(outstr) + 1;
5525 5560                  if (bufsize > size)
5526 5561                          bufsize = size;
5527 5562                  if (buf != NULL) {
5528 5563                          err = copyoutstr(outstr, buf, bufsize, NULL);
5529 5564                          if (err != 0 && err != ENAMETOOLONG)
5530 5565                                  error = EFAULT;
5531 5566                  }
5532 5567                  break;
     5568 +        case ZONE_ATTR_SECFLAGS:
     5569 +                size = sizeof (zone->zone_secflags);
     5570 +                if (bufsize > size)
     5571 +                        bufsize = size;
                           /*
                            * Skip the copy for a NULL buf, as the other fixed-size
                            * attributes do, so a size probe does not fail with EFAULT.
                            */
     5572 +                if (buf != NULL &&
                               copyout(&zone->zone_secflags, buf, bufsize) != 0)
     5573 +                        error = EFAULT;
     5574 +                break;
5533 5575          case ZONE_ATTR_NETWORK:
                           /*
                            * bufsize comes straight from the caller.  zbuf is handed to
                            * zone_get_network() as a zone_net_data_t, so the allocation
                            * must cover at least that structure; the upper bound is the
                            * same limit zone_setattr() applies to this attribute.
                            */
                           if (bufsize < sizeof (zone_net_data_t)) {
                                   error = EINVAL;
                                   break;
                           }
                           if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t)))
                                   bufsize = PIPE_BUF + sizeof (zone_net_data_t);
                           /* The success path returns size, so it must be set here. */
                           size = bufsize;
5534 5576                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5535 5577                  if (copyin(buf, zbuf, bufsize) != 0) {
5536 5578                          error = EFAULT;
5537 5579                  } else {
5538 5580                          error = zone_get_network(zoneid, zbuf);
5539 5581                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5540 5582                                  error = EFAULT;
5541 5583                  }
5542 5584                  kmem_free(zbuf, bufsize);
5543 5585                  break;
5544 5586          default:
                           /* Anything else is offered to the zone's brand, if it has one. */
5545 5587                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5546 5588                          size = bufsize;
5547 5589                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5548 5590                  } else {
5549 5591                          error = EINVAL;
5550 5592                  }
5551 5593          }
5552 5594          zone_rele(zone);
5553 5595  
5554 5596          if (error)
5555 5597                  return (set_errno(error));
5556 5598          return ((ssize_t)size);
5557 5599  }
5558 5600  
5559 5601  /*
5560 5602   * Systemcall entry point for zone_setattr(2).
            *
            * Sets attribute 'attr' on the zone identified by 'zoneid', taking the
            * new value from the caller-supplied buffer 'buf' of 'bufsize' bytes.
            * The caller must pass secpolicy_zone_config().  On the global zone only
            * ZONE_ATTR_PHYS_MCAP may be set, and every attribute other than
            * ZONE_ATTR_PHYS_MCAP is refused once the zone's status is past
            * ZONE_IS_READY.
            *
            * Returns 0 on success; otherwise the result of set_errno() with EPERM,
            * EINVAL, EFAULT, or whatever error the per-attribute helper (or the
            * brand's b_setattr hook) reported.
5561 5603   */
5562 5604  /*ARGSUSED*/
5563 5605  static int
5564 5606  zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5565 5607  {
5566 5608          zone_t *zone;
5567 5609          zone_status_t zone_status;
5568 5610          int err = -1;   /* sentinel: every path must overwrite it */
5569 5611          zone_net_data_t *zbuf;
5570 5612  
5571 5613          if (secpolicy_zone_config(CRED()) != 0)
5572 5614                  return (set_errno(EPERM));
5573 5615  
5574 5616          /*
5575 5617           * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5576 5618           * global zone.
5577 5619           */
5578 5620          if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5579 5621                  return (set_errno(EINVAL));
5580 5622          }
5581 5623  
                   /*
                    * Look the zone up under zonehash_lock and take a hold before
                    * dropping the lock; the hold is released at 'done'.
                    */
5582 5624          mutex_enter(&zonehash_lock);
5583 5625          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5584 5626                  mutex_exit(&zonehash_lock);
5585 5627                  return (set_errno(EINVAL));
5586 5628          }
5587 5629          zone_hold(zone);
5588 5630          mutex_exit(&zonehash_lock);
5589 5631  
5590 5632          /*
5591 5633           * At present most attributes can only be set on non-running,
5592 5634           * non-global zones.
5593 5635           */
5594 5636          zone_status = zone_status_get(zone);
5595 5637          if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5596 5638                  err = EINVAL;
5597 5639                  goto done;
5598 5640          }
5599 5641  
5600 5642          switch (attr) {
5601 5643          case ZONE_ATTR_INITNAME:
5602 5644                  err = zone_set_initname(zone, (const char *)buf);
5603 5645                  break;
5604 5646          case ZONE_ATTR_INITNORESTART:
5605 5647                  zone->zone_restart_init = B_FALSE;
5606 5648                  err = 0;

↓ open down ↓

64 lines elided

↑ open up ↑

5607 5649                  break;
5608 5650          case ZONE_ATTR_BOOTARGS:
5609 5651                  err = zone_set_bootargs(zone, (const char *)buf);
5610 5652                  break;
5611 5653          case ZONE_ATTR_BRAND:
5612 5654                  err = zone_set_brand(zone, (const char *)buf);
5613 5655                  break;
5614 5656          case ZONE_ATTR_FS_ALLOWED:
5615 5657                  err = zone_set_fs_allowed(zone, (const char *)buf);
5616 5658                  break;
     5659 +        case ZONE_ATTR_SECFLAGS:
     5660 +                err = zone_set_secflags(zone, (psecflags_t *)buf);
     5661 +                break;
5617 5662          case ZONE_ATTR_PHYS_MCAP:
5618 5663                  err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5619 5664                  break;
5620 5665          case ZONE_ATTR_SCHED_CLASS:
5621 5666                  err = zone_set_sched_class(zone, (const char *)buf);
5622 5667                  break;
5623 5668          case ZONE_ATTR_HOSTID:
                           /*
                            * The buffer must be exactly the size of zone_hostid;
                            * the value is copied straight into the zone.
                            */
5624 5669                  if (bufsize == sizeof (zone->zone_hostid)) {
5625 5670                          if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5626 5671                                  err = 0;

5627 5672                          else
5628 5673                                  err = EFAULT;
5629 5674                  } else {
5630 5675                          err = EINVAL;
5631 5676                  }
5632 5677                  break;
5633 5678          case ZONE_ATTR_NETWORK:
                           /*
                            * Only an upper bound on bufsize is enforced here.
                            * NOTE(review): confirm that zone_set_network() copes
                            * with a buffer shorter than zone_net_data_t (including
                            * bufsize == 0); no lower bound is checked before the
                            * temporary kernel copy is handed to it.
                            */
5634 5679                  if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5635 5680                          err = EINVAL;
5636 5681                          break;
5637 5682                  }
5638 5683                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5639 5684                  if (copyin(buf, zbuf, bufsize) != 0) {
5640 5685                          kmem_free(zbuf, bufsize);
5641 5686                          err = EFAULT;
5642 5687                          break;
5643 5688                  }
5644 5689                  err = zone_set_network(zoneid, zbuf);
5645 5690                  kmem_free(zbuf, bufsize);
5646 5691                  break;
5647 5692          default:
                           /*
                            * Attributes at or above ZONE_ATTR_BRAND_ATTRS are handed
                            * to the brand's b_setattr hook when the zone is branded;
                            * anything else is rejected.
                            */
5648 5693                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5649 5694                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5650 5695                  else
5651 5696                          err = EINVAL;
5652 5697          }
5653 5698  
5654 5699  done:
5655 5700          zone_rele(zone);
5656 5701          ASSERT(err != -1);
5657 5702          return (err != 0 ? set_errno(err) : 0);
5658 5703  }
5659 5704  
5660 5705  /*
5661 5706   * Return zero if the process has at least one vnode mapped in to its
5662 5707   * address space which shouldn't be allowed to change zones.
5663 5708   *
5664 5709   * Also return zero if the process has any shared mappings which reserve
5665 5710   * swap.  This is because the counting for zone.max-swap does not allow swap
5666 5711   * reservation to be shared between zones.  zone swap reservation is counted
5667 5712   * on zone->zone_max_swap.
            *
            * Returns 1 when every segment of curproc's address space passes both
            * checks.  The segment walk is done with the address space lock held
            * as reader.
5668 5713   */
5669 5714  static int
5670 5715  as_can_change_zones(void)
5671 5716  {
5672 5717          proc_t *pp = curproc;
5673 5718          struct seg *seg;
5674 5719          struct as *as = pp->p_as;
5675 5720          vnode_t *vp;
5676 5721          int allow = 1;  /* cleared by the first offending segment */
5677 5722  
5678 5723          ASSERT(pp->p_as != &kas);
5679 5724          AS_LOCK_ENTER(as, RW_READER);
5680 5725          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5681 5726  
5682 5727                  /*
5683 5728                   * Cannot enter zone with shared anon memory which
5684 5729                   * reserves swap.  See comment above.
5685 5730                   */
5686 5731                  if (seg_can_change_zones(seg) == B_FALSE) {
5687 5732                          allow = 0;
5688 5733                          break;
5689 5734                  }
5690 5735                  /*
5691 5736                   * if we can't get a backing vnode for this segment then skip
5692 5737                   * it.
5693 5738                   */
5694 5739                  vp = NULL;
5695 5740                  if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5696 5741                          continue;
5697 5742                  if (!vn_can_change_zones(vp)) { /* bail on first match */
5698 5743                          allow = 0;
5699 5744                          break;
5700 5745                  }
5701 5746          }
5702 5747          AS_LOCK_EXIT(as);
5703 5748          return (allow);
5704 5749  }
5705 5750  
5706 5751  /*
5707 5752   * Count swap reserved by curproc's address space
            *
            * Returns the sum of seg_swresv() over every segment.  The caller must
            * already hold the address space lock as writer (asserted below); this
            * function does not take or drop it.
5708 5753   */
5709 5754  static size_t
5710 5755  as_swresv(void)
5711 5756  {
5712 5757          proc_t *pp = curproc;
5713 5758          struct seg *seg;
5714 5759          struct as *as = pp->p_as;
5715 5760          size_t swap = 0;
5716 5761  
5717 5762          ASSERT(pp->p_as != &kas);
5718 5763          ASSERT(AS_WRITE_HELD(as));
5719 5764          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5720 5765                  swap += seg_swresv(seg);
5721 5766  
5722 5767          return (swap);
5723 5768  }
5724 5769  
5725 5770  /*
5726 5771   * Systemcall entry point for zone_enter().
5727 5772   *
5728 5773   * The current process is injected into said zone.  In the process
5729 5774   * it will change its project membership, privileges, rootdir/cwd,
5730 5775   * zone-wide rctls, and pool association to match those of the zone.
5731 5776   *
5732 5777   * The first zone_enter() called while the zone is in the ZONE_IS_READY
5733 5778   * state will transition it to ZONE_IS_RUNNING.  Processes may only
5734 5779   * enter a zone that is "ready" or "running".
            *
            * Returns 0 on success.  On failure returns set_errno() of:
            *   EPERM  - caller fails secpolicy_zone_config(), or the zone's
            *            privilege set is not a subset of the caller's permitted set
            *   EINVAL - zoneid out of range, caller not in the global zone, zone
            *            not found, contract restrictions violated, or the zone is
            *            not in a joinable state
            *   EINTR  - holdlwps() or pool_lock_intr() did not succeed
            *   EBADF  - files_can_change_zones() refused
            *   EFAULT - as_can_change_zones() refused
            * or the error returned by pool_do_bind().
5735 5780   */
5736 5781  static int
5737 5782  zone_enter(zoneid_t zoneid)
5738 5783  {
5739 5784          zone_t *zone;
5740 5785          vnode_t *vp;
5741 5786          proc_t *pp = curproc;
5742 5787          contract_t *ct;
5743 5788          cont_process_t *ctp;
5744 5789          task_t *tk, *oldtk;
5745 5790          kproject_t *zone_proj0;
5746 5791          cred_t *cr, *newcr;
5747 5792          pool_t *oldpool, *newpool;
5748 5793          sess_t *sp;
5749 5794          uid_t uid;
5750 5795          zone_status_t status;
5751 5796          int err = 0;
5752 5797          rctl_entity_p_t e;
5753 5798          size_t swap;
5754 5799          kthread_id_t t;
5755 5800  
5756 5801          if (secpolicy_zone_config(CRED()) != 0)
5757 5802                  return (set_errno(EPERM));
5758 5803          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5759 5804                  return (set_errno(EINVAL));
5760 5805  
5761 5806          /*
5762 5807           * Stop all lwps so we don't need to hold a lock to look at
5763 5808           * curproc->p_zone.  This needs to happen before we grab any
5764 5809           * locks to avoid deadlock (another lwp in the process could
5765 5810           * be waiting for the held lock).
5766 5811           */
5767 5812          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5768 5813                  return (set_errno(EINTR));
5769 5814  
5770 5815          /*
5771 5816           * Make sure we're not changing zones with files open or mapped in
5772 5817           * to our address space which shouldn't be changing zones.
5773 5818           */
5774 5819          if (!files_can_change_zones()) {
5775 5820                  err = EBADF;
5776 5821                  goto out;
5777 5822          }
5778 5823          if (!as_can_change_zones()) {
5779 5824                  err = EFAULT;
5780 5825                  goto out;
5781 5826          }
5782 5827  
                   /* Only a process currently in the global zone may enter a zone. */
5783 5828          mutex_enter(&zonehash_lock);
5784 5829          if (pp->p_zone != global_zone) {
5785 5830                  mutex_exit(&zonehash_lock);
5786 5831                  err = EINVAL;
5787 5832                  goto out;
5788 5833          }
5789 5834  
5790 5835          zone = zone_find_all_by_id(zoneid);
5791 5836          if (zone == NULL) {
5792 5837                  mutex_exit(&zonehash_lock);
5793 5838                  err = EINVAL;
5794 5839                  goto out;
5795 5840          }
5796 5841  
5797 5842          /*
5798 5843           * To prevent processes in a zone from holding contracts on
5799 5844           * extrazonal resources, and to avoid process contract
5800 5845           * memberships which span zones, contract holders and processes
5801 5846           * which aren't the sole members of their encapsulating process
5802 5847           * contracts are not allowed to zone_enter.
5803 5848           */
5804 5849          ctp = pp->p_ct_process;
5805 5850          ct = &ctp->conp_contract;
5806 5851          mutex_enter(&ct->ct_lock);
5807 5852          mutex_enter(&pp->p_lock);
5808 5853          if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5809 5854                  mutex_exit(&pp->p_lock);
5810 5855                  mutex_exit(&ct->ct_lock);
5811 5856                  mutex_exit(&zonehash_lock);
5812 5857                  err = EINVAL;
5813 5858                  goto out;
5814 5859          }
5815 5860  
5816 5861          /*
5817 5862           * Moreover, we don't allow processes whose encapsulating
5818 5863           * process contracts have inherited extrazonal contracts.
5819 5864           * While it would be easier to eliminate all process contracts
5820 5865           * with inherited contracts, we need to be able to give a
5821 5866           * restarted init (or other zone-penetrating process) its
5822 5867           * predecessor's contracts.
5823 5868           */
5824 5869          if (ctp->conp_ninherited != 0) {
5825 5870                  contract_t *next;
5826 5871                  for (next = list_head(&ctp->conp_inherited); next;
5827 5872                      next = list_next(&ctp->conp_inherited, next)) {
5828 5873                          if (contract_getzuniqid(next) != zone->zone_uniqid) {
5829 5874                                  mutex_exit(&pp->p_lock);
5830 5875                                  mutex_exit(&ct->ct_lock);
5831 5876                                  mutex_exit(&zonehash_lock);
5832 5877                                  err = EINVAL;
5833 5878                                  goto out;
5834 5879                          }
5835 5880                  }
5836 5881          }
5837 5882  
5838 5883          mutex_exit(&pp->p_lock);
5839 5884          mutex_exit(&ct->ct_lock);
5840 5885  
5841 5886          status = zone_status_get(zone);
5842 5887          if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5843 5888                  /*
5844 5889                   * Can't join
5845 5890                   */
5846 5891                  mutex_exit(&zonehash_lock);
5847 5892                  err = EINVAL;
5848 5893                  goto out;
5849 5894          }
5850 5895  
5851 5896          /*
5852 5897           * Make sure new priv set is within the permitted set for caller
5853 5898           */
5854 5899          if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5855 5900                  mutex_exit(&zonehash_lock);
5856 5901                  err = EPERM;
5857 5902                  goto out;
5858 5903          }
5859 5904          /*
5860 5905           * We want to momentarily drop zonehash_lock while we optimistically
5861 5906           * bind curproc to the pool it should be running in.  This is safe
5862 5907           * since the zone can't disappear (we have a hold on it).
5863 5908           */
5864 5909          zone_hold(zone);
5865 5910          mutex_exit(&zonehash_lock);
5866 5911  
5867 5912          /*
5868 5913           * Grab pool_lock to keep the pools configuration from changing
5869 5914           * and to stop ourselves from getting rebound to another pool
5870 5915           * until we join the zone.
5871 5916           */
5872 5917          if (pool_lock_intr() != 0) {
5873 5918                  zone_rele(zone);
5874 5919                  err = EINTR;
5875 5920                  goto out;
5876 5921          }
5877 5922          ASSERT(secpolicy_pool(CRED()) == 0);
5878 5923          /*
5879 5924           * Bind ourselves to the pool currently associated with the zone.
                    * On failure err carries pool_do_bind()'s error to the caller.
5880 5925           */
5881 5926          oldpool = curproc->p_pool;
5882 5927          newpool = zone_pool_get(zone);
5883 5928          if (pool_state == POOL_ENABLED && newpool != oldpool &&
5884 5929              (err = pool_do_bind(newpool, P_PID, P_MYID,
5885 5930              POOL_BIND_ALL)) != 0) {
5886 5931                  pool_unlock();
5887 5932                  zone_rele(zone);
5888 5933                  goto out;
5889 5934          }
5890 5935  
5891 5936          /*
5892 5937           * Grab cpu_lock now; we'll need it later when we call
5893 5938           * task_join().
                    *
                    * zonehash_lock is re-taken here and stays held until just
                    * before pool_unlock() near the end of the join.
5894 5939           */
5895 5940          mutex_enter(&cpu_lock);
5896 5941          mutex_enter(&zonehash_lock);
5897 5942          /*
5898 5943           * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5899 5944           */
5900 5945          if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5901 5946                  /*
5902 5947                   * Can't join anymore.
                            * Undo the optimistic pool binding made above.
5903 5948                   */
5904 5949                  mutex_exit(&zonehash_lock);
5905 5950                  mutex_exit(&cpu_lock);
5906 5951                  if (pool_state == POOL_ENABLED &&
5907 5952                      newpool != oldpool)
5908 5953                          (void) pool_do_bind(oldpool, P_PID, P_MYID,
5909 5954                              POOL_BIND_ALL);
5910 5955                  pool_unlock();
5911 5956                  zone_rele(zone);
5912 5957                  err = EINVAL;
5913 5958                  goto out;
5914 5959          }
5915 5960  
5916 5961          /*
5917 5962           * a_lock must be held while transferring locked memory and swap
5918 5963           * reservation from the global zone to the non global zone because
5919 5964           * asynchronous faults on the processes' address space can lock
5920 5965           * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5921 5966           * segments respectively.
5922 5967           */
5923 5968          AS_LOCK_ENTER(pp->p_as, RW_WRITER);
5924 5969          swap = as_swresv();
5925 5970          mutex_enter(&pp->p_lock);
5926 5971          zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5927 5972          /* verify that we do not exceed any task or lwp limits */
                   /*
                    * NOTE(review): no limit comparison is visible here; the counts
                    * below are simply added -- confirm where limits are enforced.
                    */
5928 5973          mutex_enter(&zone->zone_nlwps_lock);
5929 5974          /* add new lwps to zone and zone's proj0 */
5930 5975          zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5931 5976          zone->zone_nlwps += pp->p_lwpcnt;
5932 5977          /* add 1 task to zone's proj0 */
5933 5978          zone_proj0->kpj_ntasks += 1;
5934 5979  
5935 5980          zone_proj0->kpj_nprocs++;
5936 5981          zone->zone_nprocs++;
5937 5982          mutex_exit(&zone->zone_nlwps_lock);
5938 5983  
                   /* charge locked memory and swap to the new zone and its proj0 */
5939 5984          mutex_enter(&zone->zone_mem_lock);
5940 5985          zone->zone_locked_mem += pp->p_locked_mem;
5941 5986          zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5942 5987          zone->zone_max_swap += swap;
5943 5988          mutex_exit(&zone->zone_mem_lock);
5944 5989  
5945 5990          mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5946 5991          zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5947 5992          mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5948 5993  
5949 5994          /* remove lwps and process from proc's old zone and old project */
5950 5995          mutex_enter(&pp->p_zone->zone_nlwps_lock);
5951 5996          pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5952 5997          pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5953 5998          pp->p_task->tk_proj->kpj_nprocs--;
5954 5999          pp->p_zone->zone_nprocs--;
5955 6000          mutex_exit(&pp->p_zone->zone_nlwps_lock);
5956 6001  
5957 6002          mutex_enter(&pp->p_zone->zone_mem_lock);
5958 6003          pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5959 6004          pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5960 6005          pp->p_zone->zone_max_swap -= swap;
5961 6006          mutex_exit(&pp->p_zone->zone_mem_lock);
5962 6007  
5963 6008          mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5964 6009          pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5965 6010          mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5966 6011  
                   /* From here on the process belongs to the new zone. */
5967 6012          pp->p_flag |= SZONETOP;
5968 6013          pp->p_zone = zone;
5969 6014          mutex_exit(&pp->p_lock);
5970 6015          AS_LOCK_EXIT(pp->p_as);
5971 6016  
5972 6017          /*
5973 6018           * Joining the zone cannot fail from now on.
5974 6019           *
5975 6020           * This means that a lot of the following code can be commonized and
5976 6021           * shared with zsched().
5977 6022           */
5978 6023  
5979 6024          /*
5980 6025           * If the process contract fmri was inherited, we need to
5981 6026           * flag this so that any contract status will not leak
5982 6027           * extra zone information, svc_fmri in this case
5983 6028           */
5984 6029          if (ctp->conp_svc_ctid != ct->ct_id) {
5985 6030                  mutex_enter(&ct->ct_lock);
5986 6031                  ctp->conp_svc_zone_enter = ct->ct_id;
5987 6032                  mutex_exit(&ct->ct_lock);
5988 6033          }
5989 6034  
5990 6035          /*
5991 6036           * Reset the encapsulating process contract's zone.
5992 6037           */
5993 6038          ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5994 6039          contract_setzuniqid(ct, zone->zone_uniqid);
5995 6040  
5996 6041          /*
5997 6042           * Create a new task and associate the process with the project keyed
5998 6043           * by (projid,zoneid).
5999 6044           *
6000 6045           * We might as well be in project 0; the global zone's projid doesn't
6001 6046           * make much sense in a zone anyhow.
6002 6047           *
6003 6048           * This also increments zone_ntasks, and returns with p_lock held.
6004 6049           */
6005 6050          tk = task_create(0, zone);
6006 6051          oldtk = task_join(tk, 0);
6007 6052          mutex_exit(&cpu_lock);
6008 6053  
6009 6054          /*
6010 6055           * call RCTLOP_SET functions on this proc
6011 6056           */
6012 6057          e.rcep_p.zone = zone;
6013 6058          e.rcep_t = RCENTITY_ZONE;
6014 6059          (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6015 6060              RCD_CALLBACK);
6016 6061          mutex_exit(&pp->p_lock);
6017 6062  
6018 6063          /*
6019 6064           * We don't need to hold any of zsched's locks here; not only do we know
6020 6065           * the process and zone aren't going away, we know its session isn't
6021 6066           * changing either.
6022 6067           *
6023 6068           * By joining zsched's session here, we mimic the behavior in the
6024 6069           * global zone of init's sid being the pid of sched.  We extend this
6025 6070           * to all zlogin-like zone_enter()'ing processes as well.
6026 6071           */
6027 6072          mutex_enter(&pidlock);
6028 6073          sp = zone->zone_zsched->p_sessp;
6029 6074          sess_hold(zone->zone_zsched);
6030 6075          mutex_enter(&pp->p_lock);
6031 6076          pgexit(pp);
6032 6077          sess_rele(pp->p_sessp, B_TRUE);
6033 6078          pp->p_sessp = sp;
6034 6079          pgjoin(pp, zone->zone_zsched->p_pidp);
6035 6080  
6036 6081          /*
6037 6082           * If any threads are scheduled to be placed on zone wait queue they
6038 6083           * should abandon the idea since the wait queue is changing.
6039 6084           * We need to be holding pidlock & p_lock to do this.
6040 6085           */
6041 6086          if ((t = pp->p_tlist) != NULL) {
6042 6087                  do {
6043 6088                          thread_lock(t);
6044 6089                          /*
6045 6090                           * Kick this thread so that it doesn't sit
6046 6091                           * on the wrong wait queue.
6047 6092                           */
6048 6093                          if (ISWAITING(t))
6049 6094                                  setrun_locked(t);
6050 6095  
6051 6096                          if (t->t_schedflag & TS_ANYWAITQ)
6052 6097                                  t->t_schedflag &= ~ TS_ANYWAITQ;
6053 6098  
6054 6099                          thread_unlock(t);
6055 6100                  } while ((t = t->t_forw) != pp->p_tlist);
6056 6101          }
6057 6102  
6058 6103          /*
6059 6104           * If there is a default scheduling class for the zone and it is not
6060 6105           * the class we are currently in, change all of the threads in the
6061 6106           * process to the new class.  We need to be holding pidlock & p_lock
6062 6107           * when we call parmsset so this is a good place to do it.
6063 6108           */
6064 6109          if (zone->zone_defaultcid > 0 &&
6065 6110              zone->zone_defaultcid != curthread->t_cid) {
6066 6111                  pcparms_t pcparms;
6067 6112  
6068 6113                  pcparms.pc_cid = zone->zone_defaultcid;
6069 6114                  pcparms.pc_clparms[0] = 0;
6070 6115  
6071 6116                  /*
6072 6117                   * If setting the class fails, we still want to enter the zone.
6073 6118                   */
6074 6119                  if ((t = pp->p_tlist) != NULL) {
6075 6120                          do {
6076 6121                                  (void) parmsset(&pcparms, t);
6077 6122                          } while ((t = t->t_forw) != pp->p_tlist);
6078 6123                  }
6079 6124          }
6080 6125  
6081 6126          mutex_exit(&pp->p_lock);
6082 6127          mutex_exit(&pidlock);
6083 6128  
6084 6129          mutex_exit(&zonehash_lock);
6085 6130          /*
6086 6131           * We're firmly in the zone; let pools progress.
6087 6132           */
6088 6133          pool_unlock();
6089 6134          task_rele(oldtk);
6090 6135          /*
6091 6136           * We don't need to retain a hold on the zone since we already
6092 6137           * incremented zone_ntasks, so the zone isn't going anywhere.
6093 6138           */
6094 6139          zone_rele(zone);
6095 6140  
6096 6141          /*
6097 6142           * Chroot
                    * Both the current directory and the root directory are pointed
                    * at the zone's root vnode.
6098 6143           */
6099 6144          vp = zone->zone_rootvp;
6100 6145          zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6101 6146          zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6102 6147  
6103 6148          /*
6104 6149           * Change process credentials
6105 6150           */
6106 6151          newcr = cralloc();
6107 6152          mutex_enter(&pp->p_crlock);
6108 6153          cr = pp->p_cred;
6109 6154          crcopy_to(cr, newcr);
6110 6155          crsetzone(newcr, zone);
6111 6156          pp->p_cred = newcr;
6112 6157  
6113 6158          /*
6114 6159           * Restrict all process privilege sets to zone limit
6115 6160           */
6116 6161          priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6117 6162          priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6118 6163          priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6119 6164          priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6120 6165          mutex_exit(&pp->p_crlock);
6121 6166          crset(pp, newcr);
6122 6167  
6123 6168          /*
6124 6169           * Adjust upcount to reflect zone entry.
6125 6170           */
6126 6171          uid = crgetruid(newcr);
6127 6172          mutex_enter(&pidlock);
6128 6173          upcount_dec(uid, GLOBAL_ZONEID);
6129 6174          upcount_inc(uid, zoneid);
6130 6175          mutex_exit(&pidlock);
6131 6176  
6132 6177          /*
6133 6178           * Set up core file path and content.
6134 6179           */
6135 6180          set_core_defaults();
6136 6181  
6137 6182  out:
6138 6183          /*
6139 6184           * Let the other lwps continue.
                    * Reached on success and on every failure after holdlwps().
6140 6185           */
6141 6186          mutex_enter(&pp->p_lock);
6142 6187          if (curthread != pp->p_agenttp)
6143 6188                  continuelwps(pp);
6144 6189          mutex_exit(&pp->p_lock);
6145 6190  
6146 6191          return (err != 0 ? set_errno(err) : 0);
6147 6192  }
6148 6193  
6149 6194  /*
6150 6195   * Systemcall entry point for zone_list(2).
6151 6196   *
6152 6197   * Processes running in a (non-global) zone only see themselves.
6153 6198   * On labeled systems, they see all zones whose label they dominate.
            *
            * 'numzones' is in/out: on entry it gives the number of entries the
            * caller has room for in 'zoneidlist'; on return it is overwritten with
            * the number of zones found.  At most the caller's original count of
            * ids is copied out, and nothing is copied when 'zoneidlist' is NULL.
            *
            * Returns 0 on success, or set_errno(EFAULT) if a copyin/copyout fails.
6154 6199   */
6155 6200  static int
6156 6201  zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6157 6202  {
6158 6203          zoneid_t *zoneids;
6159 6204          zone_t *zone, *myzone;
6160 6205          uint_t user_nzones, real_nzones;
6161 6206          uint_t domi_nzones;
6162 6207          int error;
6163 6208  
6164 6209          if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6165 6210                  return (set_errno(EFAULT));
6166 6211  
                   /*
                    * real_nzones is the number of zoneids allocated (and later
                    * freed); domi_nzones is how many of them were actually filled in.
                    */
6167 6212          myzone = curproc->p_zone;
6168 6213          if (myzone != global_zone) {
6169 6214                  bslabel_t *mybslab;
6170 6215  
6171 6216                  if (!is_system_labeled()) {
6172 6217                          /* just return current zone */
6173 6218                          real_nzones = domi_nzones = 1;
6174 6219                          zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6175 6220                          zoneids[0] = myzone->zone_id;
6176 6221                  } else {
6177 6222                          /* return all zones that are dominated */
6178 6223                          mutex_enter(&zonehash_lock);
6179 6224                          real_nzones = zonecount;
6180 6225                          domi_nzones = 0;
6181 6226                          if (real_nzones > 0) {
6182 6227                                  zoneids = kmem_alloc(real_nzones *
6183 6228                                      sizeof (zoneid_t), KM_SLEEP);
6184 6229                                  mybslab = label2bslabel(myzone->zone_slabel);
6185 6230                                  for (zone = list_head(&zone_active);
6186 6231                                      zone != NULL;
6187 6232                                      zone = list_next(&zone_active, zone)) {
                                                   /* never report the global zone */
6188 6233                                          if (zone->zone_id == GLOBAL_ZONEID)
6189 6234                                                  continue;
                                                   /* hide other zones' scratch zones */
6190 6235                                          if (zone != myzone &&
6191 6236                                              (zone->zone_flags & ZF_IS_SCRATCH))
6192 6237                                                  continue;
6193 6238                                          /*
6194 6239                                           * Note that a label always dominates
6195 6240                                           * itself, so myzone is always included
6196 6241                                           * in the list.
6197 6242                                           */
6198 6243                                          if (bldominates(mybslab,
6199 6244                                              label2bslabel(zone->zone_slabel))) {
6200 6245                                                  zoneids[domi_nzones++] =
6201 6246                                                      zone->zone_id;
6202 6247                                          }
6203 6248                                  }
6204 6249                          }
6205 6250                          mutex_exit(&zonehash_lock);
6206 6251                  }
6207 6252          } else {
                           /* global zone caller: report every zone on zone_active */
6208 6253                  mutex_enter(&zonehash_lock);
6209 6254                  real_nzones = zonecount;
6210 6255                  domi_nzones = 0;
6211 6256                  if (real_nzones > 0) {
6212 6257                          zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6213 6258                              KM_SLEEP);
6214 6259                          for (zone = list_head(&zone_active); zone != NULL;
6215 6260                              zone = list_next(&zone_active, zone))
6216 6261                                  zoneids[domi_nzones++] = zone->zone_id;
6217 6262                          ASSERT(domi_nzones == real_nzones);
6218 6263                  }
6219 6264                  mutex_exit(&zonehash_lock);
6220 6265          }
6221 6266  
6222 6267          /*
6223 6268           * If user has allocated space for fewer entries than we found, then
6224 6269           * return only up to his limit.  Either way, tell him exactly how many
6225 6270           * we found.
6226 6271           */
6227 6272          if (domi_nzones < user_nzones)
6228 6273                  user_nzones = domi_nzones;
6229 6274          error = 0;
6230 6275          if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6231 6276                  error = EFAULT;
6232 6277          } else if (zoneidlist != NULL && user_nzones != 0) {
6233 6278                  if (copyout(zoneids, zoneidlist,
6234 6279                      user_nzones * sizeof (zoneid_t)) != 0)
6235 6280                          error = EFAULT;
6236 6281          }
6237 6282  
                   /* zoneids was only allocated when real_nzones > 0 */
6238 6283          if (real_nzones > 0)
6239 6284                  kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6240 6285  
6241 6286          if (error != 0)
6242 6287                  return (set_errno(error));
6243 6288          else
6244 6289                  return (0);
6245 6290  }
6246 6291  
6247 6292  /*
6248 6293   * Systemcall entry point for zone_lookup(2).
6249 6294   *
6250 6295   * Non-global zones are only able to see themselves and (on labeled systems)
6251 6296   * the zones they dominate.
            *
            * A NULL 'zone_name' returns the caller's own zone id.  Otherwise the
            * name is copied in (at most ZONENAME_MAX bytes) and looked up under
            * zonehash_lock.
            *
            * Returns the zone id on success.  Fails with the copyinstr() error if
            * the name cannot be copied in, or EINVAL if the zone does not exist,
            * has not reached ZONE_IS_READY, or zone_list_access() denies it.
6252 6297   */
6253 6298  static zoneid_t
6254 6299  zone_lookup(const char *zone_name)
6255 6300  {
6256 6301          char *kname;
6257 6302          zone_t *zone;
6258 6303          zoneid_t zoneid;
6259 6304          int err;
6260 6305  
6261 6306          if (zone_name == NULL) {
6262 6307                  /* return caller's zone id */
6263 6308                  return (getzoneid());
6264 6309          }
6265 6310  
6266 6311          kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6267 6312          if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6268 6313                  kmem_free(kname, ZONENAME_MAX);
6269 6314                  return (set_errno(err));
6270 6315          }
6271 6316  
6272 6317          mutex_enter(&zonehash_lock);
6273 6318          zone = zone_find_all_by_name(kname);
                   /* the kernel copy of the name is not needed past the lookup */
6274 6319          kmem_free(kname, ZONENAME_MAX);
6275 6320          /*
6276 6321           * In a non-global zone, can only lookup global and own name.
6277 6322           * In Trusted Extensions zone label dominance rules apply.
6278 6323           */
6279 6324          if (zone == NULL ||
6280 6325              zone_status_get(zone) < ZONE_IS_READY ||
6281 6326              !zone_list_access(zone)) {
6282 6327                  mutex_exit(&zonehash_lock);
6283 6328                  return (set_errno(EINVAL));
6284 6329          } else {
                           /* read the id while the lock is still held */
6285 6330                  zoneid = zone->zone_id;
6286 6331                  mutex_exit(&zonehash_lock);
6287 6332                  return (zoneid);
6288 6333          }
6289 6334  }
6290 6335  
6291 6336  static int
6292 6337  zone_version(int *version_arg)
6293 6338  {
6294 6339          int version = ZONE_SYSCALL_API_VERSION;
6295 6340  
6296 6341          if (copyout(&version, version_arg, sizeof (int)) != 0)
6297 6342                  return (set_errno(EFAULT));
6298 6343          return (0);
6299 6344  }
6300 6345  
6301 6346  /* ARGSUSED */
6302 6347  long
6303 6348  zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6304 6349  {
6305 6350          zone_def zs;
6306 6351          int err;
6307 6352  
6308 6353          switch (cmd) {
6309 6354          case ZONE_CREATE:
6310 6355                  if (get_udatamodel() == DATAMODEL_NATIVE) {
6311 6356                          if (copyin(arg1, &zs, sizeof (zone_def))) {
6312 6357                                  return (set_errno(EFAULT));
6313 6358                          }
6314 6359                  } else {
6315 6360  #ifdef _SYSCALL32_IMPL
6316 6361                          zone_def32 zs32;
6317 6362  
6318 6363                          if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6319 6364                                  return (set_errno(EFAULT));
6320 6365                          }
6321 6366                          zs.zone_name =
6322 6367                              (const char *)(unsigned long)zs32.zone_name;
6323 6368                          zs.zone_root =
6324 6369                              (const char *)(unsigned long)zs32.zone_root;
6325 6370                          zs.zone_privs =
6326 6371                              (const struct priv_set *)
6327 6372                              (unsigned long)zs32.zone_privs;
6328 6373                          zs.zone_privssz = zs32.zone_privssz;
6329 6374                          zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6330 6375                          zs.rctlbufsz = zs32.rctlbufsz;
6331 6376                          zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6332 6377                          zs.zfsbufsz = zs32.zfsbufsz;
6333 6378                          zs.extended_error =
6334 6379                              (int *)(unsigned long)zs32.extended_error;
6335 6380                          zs.match = zs32.match;
6336 6381                          zs.doi = zs32.doi;
6337 6382                          zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6338 6383                          zs.flags = zs32.flags;
6339 6384  #else
6340 6385                          panic("get_udatamodel() returned bogus result\n");
6341 6386  #endif
6342 6387                  }
6343 6388  
6344 6389                  return (zone_create(zs.zone_name, zs.zone_root,
6345 6390                      zs.zone_privs, zs.zone_privssz,
6346 6391                      (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6347 6392                      (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6348 6393                      zs.extended_error, zs.match, zs.doi,
6349 6394                      zs.label, zs.flags));
6350 6395          case ZONE_BOOT:
6351 6396                  return (zone_boot((zoneid_t)(uintptr_t)arg1));
6352 6397          case ZONE_DESTROY:
6353 6398                  return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6354 6399          case ZONE_GETATTR:
6355 6400                  return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6356 6401                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6357 6402          case ZONE_SETATTR:
6358 6403                  return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6359 6404                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6360 6405          case ZONE_ENTER:
6361 6406                  return (zone_enter((zoneid_t)(uintptr_t)arg1));
6362 6407          case ZONE_LIST:
6363 6408                  return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6364 6409          case ZONE_SHUTDOWN:
6365 6410                  return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6366 6411          case ZONE_LOOKUP:
6367 6412                  return (zone_lookup((const char *)arg1));
6368 6413          case ZONE_VERSION:
6369 6414                  return (zone_version((int *)arg1));
6370 6415          case ZONE_ADD_DATALINK:
6371 6416                  return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6372 6417                      (datalink_id_t)(uintptr_t)arg2));
6373 6418          case ZONE_DEL_DATALINK:
6374 6419                  return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6375 6420                      (datalink_id_t)(uintptr_t)arg2));
6376 6421          case ZONE_CHECK_DATALINK: {
6377 6422                  zoneid_t        zoneid;
6378 6423                  boolean_t       need_copyout;
6379 6424  
6380 6425                  if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6381 6426                          return (EFAULT);
6382 6427                  need_copyout = (zoneid == ALL_ZONES);
6383 6428                  err = zone_check_datalink(&zoneid,
6384 6429                      (datalink_id_t)(uintptr_t)arg2);
6385 6430                  if (err == 0 && need_copyout) {
6386 6431                          if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6387 6432                                  err = EFAULT;
6388 6433                  }
6389 6434                  return (err == 0 ? 0 : set_errno(err));
6390 6435          }
6391 6436          case ZONE_LIST_DATALINK:
6392 6437                  return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6393 6438                      (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6394 6439          default:
6395 6440                  return (set_errno(EINVAL));
6396 6441          }
6397 6442  }
6398 6443  
6399 6444  struct zarg {
6400 6445          zone_t *zone;
6401 6446          zone_cmd_arg_t arg;
6402 6447  };
6403 6448  
6404 6449  static int
6405 6450  zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6406 6451  {
6407 6452          char *buf;
6408 6453          size_t buflen;
6409 6454          int error;
6410 6455  
6411 6456          buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6412 6457          buf = kmem_alloc(buflen, KM_SLEEP);
6413 6458          (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6414 6459          error = door_ki_open(buf, doorp);
6415 6460          kmem_free(buf, buflen);
6416 6461          return (error);
6417 6462  }
6418 6463  
6419 6464  static void
6420 6465  zone_release_door(door_handle_t *doorp)
6421 6466  {
6422 6467          door_ki_rele(*doorp);
6423 6468          *doorp = NULL;
6424 6469  }
6425 6470  
6426 6471  static void
6427 6472  zone_ki_call_zoneadmd(struct zarg *zargp)
6428 6473  {
6429 6474          door_handle_t door = NULL;
6430 6475          door_arg_t darg, save_arg;
6431 6476          char *zone_name;
6432 6477          size_t zone_namelen;
6433 6478          zoneid_t zoneid;
6434 6479          zone_t *zone;
6435 6480          zone_cmd_arg_t arg;
6436 6481          uint64_t uniqid;
6437 6482          size_t size;
6438 6483          int error;
6439 6484          int retry;
6440 6485  
6441 6486          zone = zargp->zone;
6442 6487          arg = zargp->arg;
6443 6488          kmem_free(zargp, sizeof (*zargp));
6444 6489  
6445 6490          zone_namelen = strlen(zone->zone_name) + 1;
6446 6491          zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6447 6492          bcopy(zone->zone_name, zone_name, zone_namelen);
6448 6493          zoneid = zone->zone_id;
6449 6494          uniqid = zone->zone_uniqid;
6450 6495          /*
6451 6496           * zoneadmd may be down, but at least we can empty out the zone.
6452 6497           * We can ignore the return value of zone_empty() since we're called
6453 6498           * from a kernel thread and know we won't be delivered any signals.
6454 6499           */
6455 6500          ASSERT(curproc == &p0);
6456 6501          (void) zone_empty(zone);
6457 6502          ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6458 6503          zone_rele(zone);
6459 6504  
6460 6505          size = sizeof (arg);
6461 6506          darg.rbuf = (char *)&arg;
6462 6507          darg.data_ptr = (char *)&arg;
6463 6508          darg.rsize = size;
6464 6509          darg.data_size = size;
6465 6510          darg.desc_ptr = NULL;
6466 6511          darg.desc_num = 0;
6467 6512  
6468 6513          save_arg = darg;
6469 6514          /*
6470 6515           * Since we're not holding a reference to the zone, any number of
6471 6516           * things can go wrong, including the zone disappearing before we get a
6472 6517           * chance to talk to zoneadmd.
6473 6518           */
6474 6519          for (retry = 0; /* forever */; retry++) {
6475 6520                  if (door == NULL &&
6476 6521                      (error = zone_lookup_door(zone_name, &door)) != 0) {
6477 6522                          goto next;
6478 6523                  }
6479 6524                  ASSERT(door != NULL);
6480 6525  
6481 6526                  if ((error = door_ki_upcall_limited(door, &darg, NULL,
6482 6527                      SIZE_MAX, 0)) == 0) {
6483 6528                          break;
6484 6529                  }
6485 6530                  switch (error) {
6486 6531                  case EINTR:
6487 6532                          /* FALLTHROUGH */
6488 6533                  case EAGAIN:    /* process may be forking */
6489 6534                          /*
6490 6535                           * Back off for a bit
6491 6536                           */
6492 6537                          break;
6493 6538                  case EBADF:
6494 6539                          zone_release_door(&door);
6495 6540                          if (zone_lookup_door(zone_name, &door) != 0) {
6496 6541                                  /*
6497 6542                                   * zoneadmd may be dead, but it may come back to
6498 6543                                   * life later.
6499 6544                                   */
6500 6545                                  break;
6501 6546                          }
6502 6547                          break;
6503 6548                  default:
6504 6549                          cmn_err(CE_WARN,
6505 6550                              "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6506 6551                              error);
6507 6552                          goto out;
6508 6553                  }
6509 6554  next:
6510 6555                  /*
6511 6556                   * If this isn't the same zone_t that we originally had in mind,
6512 6557                   * then this is the same as if two kadmin requests come in at
6513 6558                   * the same time: the first one wins.  This means we lose, so we
6514 6559                   * bail.
6515 6560                   */
6516 6561                  if ((zone = zone_find_by_id(zoneid)) == NULL) {
6517 6562                          /*
6518 6563                           * Problem is solved.
6519 6564                           */
6520 6565                          break;
6521 6566                  }
6522 6567                  if (zone->zone_uniqid != uniqid) {
6523 6568                          /*
6524 6569                           * zoneid recycled
6525 6570                           */
6526 6571                          zone_rele(zone);
6527 6572                          break;
6528 6573                  }
6529 6574                  /*
6530 6575                   * We could zone_status_timedwait(), but there doesn't seem to
6531 6576                   * be much point in doing that (plus, it would mean that
6532 6577                   * zone_free() isn't called until this thread exits).
6533 6578                   */
6534 6579                  zone_rele(zone);
6535 6580                  delay(hz);
6536 6581                  darg = save_arg;
6537 6582          }
6538 6583  out:
6539 6584          if (door != NULL) {
6540 6585                  zone_release_door(&door);
6541 6586          }
6542 6587          kmem_free(zone_name, zone_namelen);
6543 6588          thread_exit();
6544 6589  }
6545 6590  
6546 6591  /*
6547 6592   * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6548 6593   * kadmin().  The caller is a process in the zone.
6549 6594   *
6550 6595   * In order to shutdown the zone, we will hand off control to zoneadmd
6551 6596   * (running in the global zone) via a door.  We do a half-hearted job at
6552 6597   * killing all processes in the zone, create a kernel thread to contact
6553 6598   * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6554 6599   * a form of generation number used to let zoneadmd (as well as
6555 6600   * zone_destroy()) know exactly which zone they're re talking about.
6556 6601   */
6557 6602  int
6558 6603  zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6559 6604  {
6560 6605          struct zarg *zargp;
6561 6606          zone_cmd_t zcmd;
6562 6607          zone_t *zone;
6563 6608  
6564 6609          zone = curproc->p_zone;
6565 6610          ASSERT(getzoneid() != GLOBAL_ZONEID);
6566 6611  
6567 6612          switch (cmd) {
6568 6613          case A_SHUTDOWN:
6569 6614                  switch (fcn) {
6570 6615                  case AD_HALT:
6571 6616                  case AD_POWEROFF:
6572 6617                          zcmd = Z_HALT;
6573 6618                          break;
6574 6619                  case AD_BOOT:
6575 6620                          zcmd = Z_REBOOT;
6576 6621                          break;
6577 6622                  case AD_IBOOT:
6578 6623                  case AD_SBOOT:
6579 6624                  case AD_SIBOOT:
6580 6625                  case AD_NOSYNC:
6581 6626                          return (ENOTSUP);
6582 6627                  default:
6583 6628                          return (EINVAL);
6584 6629                  }
6585 6630                  break;
6586 6631          case A_REBOOT:
6587 6632                  zcmd = Z_REBOOT;
6588 6633                  break;
6589 6634          case A_FTRACE:
6590 6635          case A_REMOUNT:
6591 6636          case A_FREEZE:
6592 6637          case A_DUMP:
6593 6638          case A_CONFIG:
6594 6639                  return (ENOTSUP);
6595 6640          default:
6596 6641                  ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6597 6642                  return (EINVAL);
6598 6643          }
6599 6644  
6600 6645          if (secpolicy_zone_admin(credp, B_FALSE))
6601 6646                  return (EPERM);
6602 6647          mutex_enter(&zone_status_lock);
6603 6648  
6604 6649          /*
6605 6650           * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6606 6651           * is in the zone.
6607 6652           */
6608 6653          ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6609 6654          if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6610 6655                  /*
6611 6656                   * This zone is already on its way down.
6612 6657                   */
6613 6658                  mutex_exit(&zone_status_lock);
6614 6659                  return (0);
6615 6660          }
6616 6661          /*
6617 6662           * Prevent future zone_enter()s
6618 6663           */
6619 6664          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6620 6665          mutex_exit(&zone_status_lock);
6621 6666  
6622 6667          /*
6623 6668           * Kill everyone now and call zoneadmd later.
6624 6669           * zone_ki_call_zoneadmd() will do a more thorough job of this
6625 6670           * later.
6626 6671           */
6627 6672          killall(zone->zone_id);
6628 6673          /*
6629 6674           * Now, create the thread to contact zoneadmd and do the rest of the
6630 6675           * work.  This thread can't be created in our zone otherwise
6631 6676           * zone_destroy() would deadlock.
6632 6677           */
6633 6678          zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6634 6679          zargp->arg.cmd = zcmd;
6635 6680          zargp->arg.uniqid = zone->zone_uniqid;
6636 6681          zargp->zone = zone;
6637 6682          (void) strcpy(zargp->arg.locale, "C");
6638 6683          /* mdep was already copied in for us by uadmin */
6639 6684          if (mdep != NULL)
6640 6685                  (void) strlcpy(zargp->arg.bootbuf, mdep,
6641 6686                      sizeof (zargp->arg.bootbuf));
6642 6687          zone_hold(zone);
6643 6688  
6644 6689          (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6645 6690              TS_RUN, minclsyspri);
6646 6691          exit(CLD_EXITED, 0);
6647 6692  
6648 6693          return (EINVAL);
6649 6694  }
6650 6695  
6651 6696  /*
6652 6697   * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6653 6698   * status to ZONE_IS_SHUTTING_DOWN.
6654 6699   *
6655 6700   * This function also shuts down all running zones to ensure that they won't
6656 6701   * fork new processes.
6657 6702   */
6658 6703  void
6659 6704  zone_shutdown_global(void)
6660 6705  {
6661 6706          zone_t *current_zonep;
6662 6707  
6663 6708          ASSERT(INGLOBALZONE(curproc));
6664 6709          mutex_enter(&zonehash_lock);
6665 6710          mutex_enter(&zone_status_lock);
6666 6711  
6667 6712          /* Modify the global zone's status first. */
6668 6713          ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6669 6714          zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6670 6715  
6671 6716          /*
6672 6717           * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6673 6718           * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6674 6719           * could cause assertions to fail (e.g., assertions about a zone's
6675 6720           * state during initialization, readying, or booting) or produce races.
6676 6721           * We'll let threads continue to initialize and ready new zones: they'll
6677 6722           * fail to boot the new zones when they see that the global zone is
6678 6723           * shutting down.
6679 6724           */
6680 6725          for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6681 6726              current_zonep = list_next(&zone_active, current_zonep)) {
6682 6727                  if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6683 6728                          zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6684 6729          }
6685 6730          mutex_exit(&zone_status_lock);
6686 6731          mutex_exit(&zonehash_lock);
6687 6732  }
6688 6733  
6689 6734  /*
6690 6735   * Returns true if the named dataset is visible in the current zone.
6691 6736   * The 'write' parameter is set to 1 if the dataset is also writable.
6692 6737   */
6693 6738  int
6694 6739  zone_dataset_visible(const char *dataset, int *write)
6695 6740  {
6696 6741          static int zfstype = -1;
6697 6742          zone_dataset_t *zd;
6698 6743          size_t len;
6699 6744          zone_t *zone = curproc->p_zone;
6700 6745          const char *name = NULL;
6701 6746          vfs_t *vfsp = NULL;
6702 6747  
6703 6748          if (dataset[0] == '\0')
6704 6749                  return (0);
6705 6750  
6706 6751          /*
6707 6752           * Walk the list once, looking for datasets which match exactly, or
6708 6753           * specify a dataset underneath an exported dataset.  If found, return
6709 6754           * true and note that it is writable.
6710 6755           */
6711 6756          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6712 6757              zd = list_next(&zone->zone_datasets, zd)) {
6713 6758  
6714 6759                  len = strlen(zd->zd_dataset);
6715 6760                  if (strlen(dataset) >= len &&
6716 6761                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6717 6762                      (dataset[len] == '\0' || dataset[len] == '/' ||
6718 6763                      dataset[len] == '@')) {
6719 6764                          if (write)
6720 6765                                  *write = 1;
6721 6766                          return (1);
6722 6767                  }
6723 6768          }
6724 6769  
6725 6770          /*
6726 6771           * Walk the list a second time, searching for datasets which are parents
6727 6772           * of exported datasets.  These should be visible, but read-only.
6728 6773           *
6729 6774           * Note that we also have to support forms such as 'pool/dataset/', with
6730 6775           * a trailing slash.
6731 6776           */
6732 6777          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6733 6778              zd = list_next(&zone->zone_datasets, zd)) {
6734 6779  
6735 6780                  len = strlen(dataset);
6736 6781                  if (dataset[len - 1] == '/')
6737 6782                          len--;  /* Ignore trailing slash */
6738 6783                  if (len < strlen(zd->zd_dataset) &&
6739 6784                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6740 6785                      zd->zd_dataset[len] == '/') {
6741 6786                          if (write)
6742 6787                                  *write = 0;
6743 6788                          return (1);
6744 6789                  }
6745 6790          }
6746 6791  
6747 6792          /*
6748 6793           * We reach here if the given dataset is not found in the zone_dataset
6749 6794           * list. Check if this dataset was added as a filesystem (ie. "add fs")
6750 6795           * instead of delegation. For this we search for the dataset in the
6751 6796           * zone_vfslist of this zone. If found, return true and note that it is
6752 6797           * not writable.
6753 6798           */
6754 6799  
6755 6800          /*
6756 6801           * Initialize zfstype if it is not initialized yet.
6757 6802           */
6758 6803          if (zfstype == -1) {
6759 6804                  struct vfssw *vswp = vfs_getvfssw("zfs");
6760 6805                  zfstype = vswp - vfssw;
6761 6806                  vfs_unrefvfssw(vswp);
6762 6807          }
6763 6808  
6764 6809          vfs_list_read_lock();
6765 6810          vfsp = zone->zone_vfslist;
6766 6811          do {
6767 6812                  ASSERT(vfsp);
6768 6813                  if (vfsp->vfs_fstype == zfstype) {
6769 6814                          name = refstr_value(vfsp->vfs_resource);
6770 6815  
6771 6816                          /*
6772 6817                           * Check if we have an exact match.
6773 6818                           */
6774 6819                          if (strcmp(dataset, name) == 0) {
6775 6820                                  vfs_list_unlock();
6776 6821                                  if (write)
6777 6822                                          *write = 0;
6778 6823                                  return (1);
6779 6824                          }
6780 6825                          /*
6781 6826                           * We need to check if we are looking for parents of
6782 6827                           * a dataset. These should be visible, but read-only.
6783 6828                           */
6784 6829                          len = strlen(dataset);
6785 6830                          if (dataset[len - 1] == '/')
6786 6831                                  len--;
6787 6832  
6788 6833                          if (len < strlen(name) &&
6789 6834                              bcmp(dataset, name, len) == 0 && name[len] == '/') {
6790 6835                                  vfs_list_unlock();
6791 6836                                  if (write)
6792 6837                                          *write = 0;
6793 6838                                  return (1);
6794 6839                          }
6795 6840                  }
6796 6841                  vfsp = vfsp->vfs_zone_next;
6797 6842          } while (vfsp != zone->zone_vfslist);
6798 6843  
6799 6844          vfs_list_unlock();
6800 6845          return (0);
6801 6846  }
6802 6847  
6803 6848  /*
6804 6849   * zone_find_by_any_path() -
6805 6850   *
6806 6851   * kernel-private routine similar to zone_find_by_path(), but which
6807 6852   * effectively compares against zone paths rather than zonerootpath
6808 6853   * (i.e., the last component of zonerootpaths, which should be "root/",
6809 6854   * are not compared.)  This is done in order to accurately identify all
6810 6855   * paths, whether zone-visible or not, including those which are parallel
6811 6856   * to /root/, such as /dev/, /home/, etc...
6812 6857   *
6813 6858   * If the specified path does not fall under any zone path then global
6814 6859   * zone is returned.
6815 6860   *
6816 6861   * The treat_abs parameter indicates whether the path should be treated as
6817 6862   * an absolute path although it does not begin with "/".  (This supports
6818 6863   * nfs mount syntax such as host:any/path.)
6819 6864   *
6820 6865   * The caller is responsible for zone_rele of the returned zone.
6821 6866   */
6822 6867  zone_t *
6823 6868  zone_find_by_any_path(const char *path, boolean_t treat_abs)
6824 6869  {
6825 6870          zone_t *zone;
6826 6871          int path_offset = 0;
6827 6872  
6828 6873          if (path == NULL) {
6829 6874                  zone_hold(global_zone);
6830 6875                  return (global_zone);
6831 6876          }
6832 6877  
6833 6878          if (*path != '/') {
6834 6879                  ASSERT(treat_abs);
6835 6880                  path_offset = 1;
6836 6881          }
6837 6882  
6838 6883          mutex_enter(&zonehash_lock);
6839 6884          for (zone = list_head(&zone_active); zone != NULL;
6840 6885              zone = list_next(&zone_active, zone)) {
6841 6886                  char    *c;
6842 6887                  size_t  pathlen;
6843 6888                  char *rootpath_start;
6844 6889  
6845 6890                  if (zone == global_zone)        /* skip global zone */
6846 6891                          continue;
6847 6892  
6848 6893                  /* scan backwards to find start of last component */
6849 6894                  c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6850 6895                  do {
6851 6896                          c--;
6852 6897                  } while (*c != '/');
6853 6898  
6854 6899                  pathlen = c - zone->zone_rootpath + 1 - path_offset;
6855 6900                  rootpath_start = (zone->zone_rootpath + path_offset);
6856 6901                  if (strncmp(path, rootpath_start, pathlen) == 0)
6857 6902                          break;
6858 6903          }
6859 6904          if (zone == NULL)
6860 6905                  zone = global_zone;
6861 6906          zone_hold(zone);
6862 6907          mutex_exit(&zonehash_lock);
6863 6908          return (zone);
6864 6909  }
6865 6910  
6866 6911  /*
6867 6912   * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6868 6913   * zone_dl_t pointer if found, and NULL otherwise.
6869 6914   */
6870 6915  static zone_dl_t *
6871 6916  zone_find_dl(zone_t *zone, datalink_id_t linkid)
6872 6917  {
6873 6918          zone_dl_t *zdl;
6874 6919  
6875 6920          ASSERT(mutex_owned(&zone->zone_lock));
6876 6921          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6877 6922              zdl = list_next(&zone->zone_dl_list, zdl)) {
6878 6923                  if (zdl->zdl_id == linkid)
6879 6924                          break;
6880 6925          }
6881 6926          return (zdl);
6882 6927  }
6883 6928  
6884 6929  static boolean_t
6885 6930  zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6886 6931  {
6887 6932          boolean_t exists;
6888 6933  
6889 6934          mutex_enter(&zone->zone_lock);
6890 6935          exists = (zone_find_dl(zone, linkid) != NULL);
6891 6936          mutex_exit(&zone->zone_lock);
6892 6937          return (exists);
6893 6938  }
6894 6939  
6895 6940  /*
6896 6941   * Add an data link name for the zone.
6897 6942   */
6898 6943  static int
6899 6944  zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6900 6945  {
6901 6946          zone_dl_t *zdl;
6902 6947          zone_t *zone;
6903 6948          zone_t *thiszone;
6904 6949  
6905 6950          if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6906 6951                  return (set_errno(ENXIO));
6907 6952  
6908 6953          /* Verify that the datalink ID doesn't already belong to a zone. */
6909 6954          mutex_enter(&zonehash_lock);
6910 6955          for (zone = list_head(&zone_active); zone != NULL;
6911 6956              zone = list_next(&zone_active, zone)) {
6912 6957                  if (zone_dl_exists(zone, linkid)) {
6913 6958                          mutex_exit(&zonehash_lock);
6914 6959                          zone_rele(thiszone);
6915 6960                          return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6916 6961                  }
6917 6962          }
6918 6963  
6919 6964          zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6920 6965          zdl->zdl_id = linkid;
6921 6966          zdl->zdl_net = NULL;
6922 6967          mutex_enter(&thiszone->zone_lock);
6923 6968          list_insert_head(&thiszone->zone_dl_list, zdl);
6924 6969          mutex_exit(&thiszone->zone_lock);
6925 6970          mutex_exit(&zonehash_lock);
6926 6971          zone_rele(thiszone);
6927 6972          return (0);
6928 6973  }
6929 6974  
6930 6975  static int
6931 6976  zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6932 6977  {
6933 6978          zone_dl_t *zdl;
6934 6979          zone_t *zone;
6935 6980          int err = 0;
6936 6981  
6937 6982          if ((zone = zone_find_by_id(zoneid)) == NULL)
6938 6983                  return (set_errno(EINVAL));
6939 6984  
6940 6985          mutex_enter(&zone->zone_lock);
6941 6986          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6942 6987                  err = ENXIO;
6943 6988          } else {
6944 6989                  list_remove(&zone->zone_dl_list, zdl);
6945 6990                  nvlist_free(zdl->zdl_net);
6946 6991                  kmem_free(zdl, sizeof (zone_dl_t));
6947 6992          }
6948 6993          mutex_exit(&zone->zone_lock);
6949 6994          zone_rele(zone);
6950 6995          return (err == 0 ? 0 : set_errno(err));
6951 6996  }
6952 6997  
6953 6998  /*
6954 6999   * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
6955 7000   * the linkid.  Otherwise we just check if the specified zoneidp has been
6956 7001   * assigned the supplied linkid.
6957 7002   */
6958 7003  int
6959 7004  zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6960 7005  {
6961 7006          zone_t *zone;
6962 7007          int err = ENXIO;
6963 7008  
6964 7009          if (*zoneidp != ALL_ZONES) {
6965 7010                  if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6966 7011                          if (zone_dl_exists(zone, linkid))
6967 7012                                  err = 0;
6968 7013                          zone_rele(zone);
6969 7014                  }
6970 7015                  return (err);
6971 7016          }
6972 7017  
6973 7018          mutex_enter(&zonehash_lock);
6974 7019          for (zone = list_head(&zone_active); zone != NULL;
6975 7020              zone = list_next(&zone_active, zone)) {
6976 7021                  if (zone_dl_exists(zone, linkid)) {
6977 7022                          *zoneidp = zone->zone_id;
6978 7023                          err = 0;
6979 7024                          break;
6980 7025                  }
6981 7026          }
6982 7027          mutex_exit(&zonehash_lock);
6983 7028          return (err);
6984 7029  }
6985 7030  
6986 7031  /*
6987 7032   * Get the list of datalink IDs assigned to a zone.
6988 7033   *
6989 7034   * On input, *nump is the number of datalink IDs that can fit in the supplied
6990 7035   * idarray.  Upon return, *nump is either set to the number of datalink IDs
6991 7036   * that were placed in the array if the array was large enough, or to the
6992 7037   * number of datalink IDs that the function needs to place in the array if the
6993 7038   * array is too small.
6994 7039   */
6995 7040  static int
6996 7041  zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6997 7042  {
6998 7043          uint_t num, dlcount;
6999 7044          zone_t *zone;
7000 7045          zone_dl_t *zdl;
7001 7046          datalink_id_t *idptr = idarray;
7002 7047  
7003 7048          if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7004 7049                  return (set_errno(EFAULT));
7005 7050          if ((zone = zone_find_by_id(zoneid)) == NULL)
7006 7051                  return (set_errno(ENXIO));
7007 7052  
7008 7053          num = 0;
7009 7054          mutex_enter(&zone->zone_lock);
7010 7055          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7011 7056              zdl = list_next(&zone->zone_dl_list, zdl)) {
7012 7057                  /*
7013 7058                   * If the list is bigger than what the caller supplied, just
7014 7059                   * count, don't do copyout.
7015 7060                   */
7016 7061                  if (++num > dlcount)
7017 7062                          continue;
7018 7063                  if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7019 7064                          mutex_exit(&zone->zone_lock);
7020 7065                          zone_rele(zone);
7021 7066                          return (set_errno(EFAULT));
7022 7067                  }
7023 7068                  idptr++;
7024 7069          }
7025 7070          mutex_exit(&zone->zone_lock);
7026 7071          zone_rele(zone);
7027 7072  
7028 7073          /* Increased or decreased, caller should be notified. */
7029 7074          if (num != dlcount) {
7030 7075                  if (copyout(&num, nump, sizeof (num)) != 0)
7031 7076                          return (set_errno(EFAULT));
7032 7077          }
7033 7078          return (0);
7034 7079  }
7035 7080  
7036 7081  /*
7037 7082   * Public interface for looking up a zone by zoneid. It's a customized version
7038 7083   * for netstack_zone_create(). It can only be called from the zsd create
7039 7084   * callbacks, since it doesn't have reference on the zone structure hence if
7040 7085   * it is called elsewhere the zone could disappear after the zonehash_lock
7041 7086   * is dropped.
7042 7087   *
7043 7088   * Furthermore it
7044 7089   * 1. Doesn't check the status of the zone.
7045 7090   * 2. It will be called even before zone_init is called, in that case the
7046 7091   *    address of zone0 is returned directly, and netstack_zone_create()
7047 7092   *    will only assign a value to zone0.zone_netstack, won't break anything.
7048 7093   * 3. Returns without the zone being held.
7049 7094   */
7050 7095  zone_t *
7051 7096  zone_find_by_id_nolock(zoneid_t zoneid)
7052 7097  {
7053 7098          zone_t *zone;
7054 7099  
7055 7100          mutex_enter(&zonehash_lock);
7056 7101          if (zonehashbyid == NULL)
7057 7102                  zone = &zone0;
7058 7103          else
7059 7104                  zone = zone_find_all_by_id(zoneid);
7060 7105          mutex_exit(&zonehash_lock);
7061 7106          return (zone);
7062 7107  }
7063 7108  
7064 7109  /*
7065 7110   * Walk the datalinks for a given zone
7066 7111   */
7067 7112  int
7068 7113  zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7069 7114      void *data)
7070 7115  {
7071 7116          zone_t          *zone;
7072 7117          zone_dl_t       *zdl;
7073 7118          datalink_id_t   *idarray;
7074 7119          uint_t          idcount = 0;
7075 7120          int             i, ret = 0;
7076 7121  
7077 7122          if ((zone = zone_find_by_id(zoneid)) == NULL)
7078 7123                  return (ENOENT);
7079 7124  
7080 7125          /*
7081 7126           * We first build an array of linkid's so that we can walk these and
7082 7127           * execute the callback with the zone_lock dropped.
7083 7128           */
7084 7129          mutex_enter(&zone->zone_lock);
7085 7130          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7086 7131              zdl = list_next(&zone->zone_dl_list, zdl)) {
7087 7132                  idcount++;
7088 7133          }
7089 7134  
7090 7135          if (idcount == 0) {
7091 7136                  mutex_exit(&zone->zone_lock);
7092 7137                  zone_rele(zone);
7093 7138                  return (0);
7094 7139          }
7095 7140  
7096 7141          idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7097 7142          if (idarray == NULL) {
7098 7143                  mutex_exit(&zone->zone_lock);
7099 7144                  zone_rele(zone);
7100 7145                  return (ENOMEM);
7101 7146          }
7102 7147  
7103 7148          for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7104 7149              i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7105 7150                  idarray[i] = zdl->zdl_id;
7106 7151          }
7107 7152  
7108 7153          mutex_exit(&zone->zone_lock);
7109 7154  
7110 7155          for (i = 0; i < idcount && ret == 0; i++) {
7111 7156                  if ((ret = (*cb)(idarray[i], data)) != 0)
7112 7157                          break;
7113 7158          }
7114 7159  
7115 7160          zone_rele(zone);
7116 7161          kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7117 7162          return (ret);
7118 7163  }
7119 7164  
7120 7165  static char *
7121 7166  zone_net_type2name(int type)
7122 7167  {
7123 7168          switch (type) {
7124 7169          case ZONE_NETWORK_ADDRESS:
7125 7170                  return (ZONE_NET_ADDRNAME);
7126 7171          case ZONE_NETWORK_DEFROUTER:
7127 7172                  return (ZONE_NET_RTRNAME);
7128 7173          default:
7129 7174                  return (NULL);
7130 7175          }
7131 7176  }
7132 7177  
7133 7178  static int
7134 7179  zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7135 7180  {
7136 7181          zone_t *zone;
7137 7182          zone_dl_t *zdl;
7138 7183          nvlist_t *nvl;
7139 7184          int err = 0;
7140 7185          uint8_t *new = NULL;
7141 7186          char *nvname;
7142 7187          int bufsize;
7143 7188          datalink_id_t linkid = znbuf->zn_linkid;
7144 7189  
7145 7190          if (secpolicy_zone_config(CRED()) != 0)
7146 7191                  return (set_errno(EPERM));
7147 7192  
7148 7193          if (zoneid == GLOBAL_ZONEID)
7149 7194                  return (set_errno(EINVAL));
7150 7195  
7151 7196          nvname = zone_net_type2name(znbuf->zn_type);
7152 7197          bufsize = znbuf->zn_len;
7153 7198          new = znbuf->zn_val;
7154 7199          if (nvname == NULL)
7155 7200                  return (set_errno(EINVAL));
7156 7201  
7157 7202          if ((zone = zone_find_by_id(zoneid)) == NULL) {
7158 7203                  return (set_errno(EINVAL));
7159 7204          }
7160 7205  
7161 7206          mutex_enter(&zone->zone_lock);
7162 7207          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7163 7208                  err = ENXIO;
7164 7209                  goto done;
7165 7210          }
7166 7211          if ((nvl = zdl->zdl_net) == NULL) {
7167 7212                  if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7168 7213                          err = ENOMEM;
7169 7214                          goto done;
7170 7215                  } else {
7171 7216                          zdl->zdl_net = nvl;
7172 7217                  }
7173 7218          }
7174 7219          if (nvlist_exists(nvl, nvname)) {
7175 7220                  err = EINVAL;
7176 7221                  goto done;
7177 7222          }
7178 7223          err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7179 7224          ASSERT(err == 0);
7180 7225  done:
7181 7226          mutex_exit(&zone->zone_lock);
7182 7227          zone_rele(zone);
7183 7228          if (err != 0)
7184 7229                  return (set_errno(err));
7185 7230          else
7186 7231                  return (0);
7187 7232  }
7188 7233  
7189 7234  static int
7190 7235  zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7191 7236  {
7192 7237          zone_t *zone;
7193 7238          zone_dl_t *zdl;
7194 7239          nvlist_t *nvl;
7195 7240          uint8_t *ptr;
7196 7241          uint_t psize;
7197 7242          int err = 0;
7198 7243          char *nvname;
7199 7244          int bufsize;
7200 7245          void *buf;
7201 7246          datalink_id_t linkid = znbuf->zn_linkid;
7202 7247  
7203 7248          if (zoneid == GLOBAL_ZONEID)
7204 7249                  return (set_errno(EINVAL));
7205 7250  
7206 7251          nvname = zone_net_type2name(znbuf->zn_type);
7207 7252          bufsize = znbuf->zn_len;
7208 7253          buf = znbuf->zn_val;
7209 7254  
7210 7255          if (nvname == NULL)
7211 7256                  return (set_errno(EINVAL));
7212 7257          if ((zone = zone_find_by_id(zoneid)) == NULL)
7213 7258                  return (set_errno(EINVAL));
7214 7259  
7215 7260          mutex_enter(&zone->zone_lock);
7216 7261          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7217 7262                  err = ENXIO;
7218 7263                  goto done;
7219 7264          }
7220 7265          if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7221 7266                  err = ENOENT;
7222 7267                  goto done;
7223 7268          }
7224 7269          err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7225 7270          ASSERT(err == 0);
7226 7271  
7227 7272          if (psize > bufsize) {
7228 7273                  err = ENOBUFS;
7229 7274                  goto done;
7230 7275          }
7231 7276          znbuf->zn_len = psize;
7232 7277          bcopy(ptr, buf, psize);
7233 7278  done:
7234 7279          mutex_exit(&zone->zone_lock);
7235 7280          zone_rele(zone);
7236 7281          if (err != 0)
7237 7282                  return (set_errno(err));
7238 7283          else
7239 7284                  return (0);
7240 7285  }

↓ open down ↓

1614 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX