illumos-gate Wdiff usr/src/uts/common/os/zone.c

Print this page

9936 atomic ops in syscall_mstate() induce significant overhead
9942 zone secflags are not initialized correctly

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/zone.c
          +++ new/usr/src/uts/common/os/zone.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2015, Joyent Inc. All rights reserved.
  25   25   * Copyright (c) 2016 by Delphix. All rights reserved.
  26   26   * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
  27   27   */
  28   28  
  29   29  /*
  30   30   * Zones
  31   31   *
  32   32   *   A zone is a named collection of processes, namespace constraints,
  33   33   *   and other system resources which comprise a secure and manageable
  34   34   *   application containment facility.
  35   35   *
  36   36   *   Zones (represented by the reference counted zone_t) are tracked in
  37   37   *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  38   38   *   (zoneid_t) are used to track zone association.  Zone IDs are
  39   39   *   dynamically generated when the zone is created; if a persistent
  40   40   *   identifier is needed (core files, accounting logs, audit trail,
  41   41   *   etc.), the zone name should be used.
  42   42   *
  43   43   *
  44   44   *   Global Zone:
  45   45   *
  46   46   *   The global zone (zoneid 0) is automatically associated with all
  47   47   *   system resources that have not been bound to a user-created zone.
  48   48   *   This means that even systems where zones are not in active use
  49   49   *   have a global zone, and all processes, mounts, etc. are
  50   50   *   associated with that zone.  The global zone is generally
  51   51   *   unconstrained in terms of privileges and access, though the usual
  52   52   *   credential and privilege based restrictions apply.
  53   53   *
  54   54   *
  55   55   *   Zone States:
  56   56   *
  57   57   *   The states in which a zone may be in and the transitions are as
  58   58   *   follows:
  59   59   *
  60   60   *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  61   61   *   initialized zone is added to the list of active zones on the system but
  62   62   *   isn't accessible.
  63   63   *
  64   64   *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  65   65   *   not yet completed. Not possible to enter the zone, but attributes can
  66   66   *   be retrieved.
  67   67   *
  68   68   *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  69   69   *   ready.  The zone is made visible after the ZSD constructor callbacks are
  70   70   *   executed.  A zone remains in this state until it transitions into
  71   71   *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  72   72   *
  73   73   *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  74   74   *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  75   75   *   state.
  76   76   *
  77   77   *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  78   78   *   successfully started init.   A zone remains in this state until
  79   79   *   zone_shutdown() is called.
  80   80   *
  81   81   *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  82   82   *   killing all processes running in the zone. The zone remains
  83   83   *   in this state until there are no more user processes running in the zone.
  84   84   *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  85   85   *   Since zone_shutdown() is restartable, it may be called successfully
  86   86   *   multiple times for the same zone_t.  Setting of the zone's state to
  87   87   *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  88   88   *   the zone's status without worrying about it being a moving target.
  89   89   *
  90   90   *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  91   91   *   are no more user processes in the zone.  The zone remains in this
  92   92   *   state until there are no more kernel threads associated with the
  93   93   *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  94   94   *   fail.
  95   95   *
  96   96   *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  97   97   *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  98   98   *   join the zone or create kernel threads therein.
  99   99   *
 100  100   *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 101  101   *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 102  102   *   return NULL from now on.
 103  103   *
 104  104   *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 105  105   *   processes or threads doing work on behalf of the zone.  The zone is
 106  106   *   removed from the list of active zones.  zone_destroy() returns, and
 107  107   *   the zone can be recreated.
 108  108   *
 109  109   *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 110  110   *   callbacks are executed, and all memory associated with the zone is
 111  111   *   freed.
 112  112   *
 113  113   *   Threads can wait for the zone to enter a requested state by using
 114  114   *   zone_status_wait() or zone_status_timedwait() with the desired
 115  115   *   state passed in as an argument.  Zone state transitions are
 116  116   *   uni-directional; it is not possible to move back to an earlier state.
 117  117   *
 118  118   *
 119  119   *   Zone-Specific Data:
 120  120   *
 121  121   *   Subsystems needing to maintain zone-specific data can store that
 122  122   *   data using the ZSD mechanism.  This provides a zone-specific data
 123  123   *   store, similar to thread-specific data (see pthread_getspecific(3C)
 124  124   *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 125  125   *   to register callbacks to be invoked when a zone is created, shut
 126  126   *   down, or destroyed.  This can be used to initialize zone-specific
 127  127   *   data for new zones and to clean up when zones go away.
 128  128   *
 129  129   *
 130  130   *   Data Structures:
 131  131   *
 132  132   *   The per-zone structure (zone_t) is reference counted, and freed
 133  133   *   when all references are released.  zone_hold and zone_rele can be
 134  134   *   used to adjust the reference count.  In addition, reference counts
 135  135   *   associated with the cred_t structure are tracked separately using
 136  136   *   zone_cred_hold and zone_cred_rele.
 137  137   *
 138  138   *   Pointers to active zone_t's are stored in two hash tables; one
 139  139   *   for searching by id, the other for searching by name.  Lookups
 140  140   *   can be performed on either basis, using zone_find_by_id and
 141  141   *   zone_find_by_name.  Both return zone_t pointers with the zone
 142  142   *   held, so zone_rele should be called when the pointer is no longer
 143  143   *   needed.  Zones can also be searched by path; zone_find_by_path
 144  144   *   returns the zone with which a path name is associated (global
 145  145   *   zone if the path is not within some other zone's file system
 146  146   *   hierarchy).  This currently requires iterating through each zone,
 147  147   *   so it is slower than an id or name search via a hash table.
 148  148   *
 149  149   *
 150  150   *   Locking:
 151  151   *
 152  152   *   zonehash_lock: This is a top-level global lock used to protect the
 153  153   *       zone hash tables and lists.  Zones cannot be created or destroyed
 154  154   *       while this lock is held.
 155  155   *   zone_status_lock: This is a global lock protecting zone state.
 156  156   *       Zones cannot change state while this lock is held.  It also
 157  157   *       protects the list of kernel threads associated with a zone.
 158  158   *   zone_lock: This is a per-zone lock used to protect several fields of
 159  159   *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 160  160   *       this lock means that the zone cannot go away.
 161  161   *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 162  162   *       related to the zone.max-lwps rctl.

↓ open down ↓

162 lines elided

↑ open up ↑

 163  163   *   zone_mem_lock: This is a per-zone lock used to protect the fields
 164  164   *       related to the zone.max-locked-memory and zone.max-swap rctls.
 165  165   *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 166  166   *       currently just max_lofi
 167  167   *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 168  168   *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 169  169   *       list (a list of zones in the ZONE_IS_DEAD state).
 170  170   *
 171  171   *   Ordering requirements:
 172  172   *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 173      - *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
      173 + *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 174  174   *
 175  175   *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 176  176   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 177  177   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 178  178   *
 179  179   *   Blocking memory allocations are permitted while holding any of the
 180  180   *   zone locks.
 181  181   *
 182  182   *
 183  183   *   System Call Interface:

 184  184   *
 185  185   *   The zone subsystem can be managed and queried from user level with
 186  186   *   the following system calls (all subcodes of the primary "zone"
 187  187   *   system call):
 188  188   *   - zone_create: creates a zone with selected attributes (name,
 189  189   *     root path, privileges, resource controls, ZFS datasets)
 190  190   *   - zone_enter: allows the current process to enter a zone
 191  191   *   - zone_getattr: reports attributes of a zone
 192  192   *   - zone_setattr: set attributes of a zone
 193  193   *   - zone_boot: set 'init' running for the zone
 194  194   *   - zone_list: lists all zones active in the system
 195  195   *   - zone_lookup: looks up zone id based on name
 196  196   *   - zone_shutdown: initiates shutdown process (see states above)
 197  197   *   - zone_destroy: completes shutdown process (see states above)
 198  198   *
 199  199   */
 200  200  
 201  201  #include <sys/priv_impl.h>
 202  202  #include <sys/cred.h>
 203  203  #include <c2/audit.h>
 204  204  #include <sys/debug.h>
 205  205  #include <sys/file.h>
 206  206  #include <sys/kmem.h>
 207  207  #include <sys/kstat.h>
 208  208  #include <sys/mutex.h>
 209  209  #include <sys/note.h>
 210  210  #include <sys/pathname.h>
 211  211  #include <sys/proc.h>
 212  212  #include <sys/project.h>
 213  213  #include <sys/sysevent.h>
 214  214  #include <sys/task.h>
 215  215  #include <sys/systm.h>
 216  216  #include <sys/types.h>
 217  217  #include <sys/utsname.h>
 218  218  #include <sys/vnode.h>
 219  219  #include <sys/vfs.h>
 220  220  #include <sys/systeminfo.h>
 221  221  #include <sys/policy.h>
 222  222  #include <sys/cred_impl.h>
 223  223  #include <sys/contract_impl.h>
 224  224  #include <sys/contract/process_impl.h>
 225  225  #include <sys/class.h>
 226  226  #include <sys/pool.h>
 227  227  #include <sys/pool_pset.h>
 228  228  #include <sys/pset.h>
 229  229  #include <sys/strlog.h>
 230  230  #include <sys/sysmacros.h>
 231  231  #include <sys/callb.h>
 232  232  #include <sys/vmparam.h>
 233  233  #include <sys/corectl.h>
 234  234  #include <sys/ipc_impl.h>
 235  235  #include <sys/klpd.h>
 236  236  
 237  237  #include <sys/door.h>
 238  238  #include <sys/cpuvar.h>
 239  239  #include <sys/sdt.h>
 240  240  
 241  241  #include <sys/uadmin.h>
 242  242  #include <sys/session.h>
 243  243  #include <sys/cmn_err.h>
 244  244  #include <sys/modhash.h>
 245  245  #include <sys/sunddi.h>
 246  246  #include <sys/nvpair.h>
 247  247  #include <sys/rctl.h>
 248  248  #include <sys/fss.h>
 249  249  #include <sys/brand.h>
 250  250  #include <sys/zone.h>
 251  251  #include <net/if.h>
 252  252  #include <sys/cpucaps.h>
 253  253  #include <vm/seg.h>
 254  254  #include <sys/mac.h>
 255  255  
 256  256  /*
 257  257   * This constant specifies the number of seconds that threads waiting for
 258  258   * subsystems to release a zone's general-purpose references will wait before
 259  259   * they log the zone's reference counts.  The constant's value shouldn't
 260  260   * be so small that reference counts are unnecessarily reported for zones
 261  261   * whose references are slowly released.  On the other hand, it shouldn't be so
 262  262   * large that users reboot their systems out of frustration over hung zones
 263  263   * before the system logs the zones' reference counts.
 264  264   */
 265  265  #define ZONE_DESTROY_TIMEOUT_SECS       60
 266  266  
 267  267  /* List of data link IDs which are accessible from the zone */
 268  268  typedef struct zone_dl {
 269  269          datalink_id_t   zdl_id;         /* ID of the data link */
 270  270          nvlist_t        *zdl_net;       /* NOTE(review): presumably per-link network data; confirm */
 271  271          list_node_t     zdl_linkage;    /* list node linking this entry into a list_t */
 272  272  } zone_dl_t;
 273  273  
 274  274  /*
 275  275   * cv used to signal that all references to the zone have been released.  This
 276  276   * needs to be global since there may be multiple waiters, and the first to
 277  277   * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 278  278   */
 279  279  static kcondvar_t zone_destroy_cv;
 280  280  /*
 281  281   * Lock used to serialize access to zone_cv.  This could have been per-zone,
 282  282   * but then we'd need another lock for zone_destroy_cv, and why bother?
 283  283   */
 284  284  static kmutex_t zone_status_lock;
 285  285  
 286  286  /*
 287  287   * ZSD-related global variables.
 288  288   */
 289  289  static kmutex_t zsd_key_lock;   /* protects the following two */
 290  290  /*
 291  291   * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 292  292   */
 293  293  static zone_key_t zsd_keyval = 0;
 294  294  /*
 295  295   * Global list of registered keys.  We use this when a new zone is created.
 296  296   */
 297  297  static list_t zsd_registered_keys;
 298  298  
 299  299  int zone_hash_size = 256;
 300  300  static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 301  301  static kmutex_t zonehash_lock;
 302  302  static uint_t zonecount;
 303  303  static id_space_t *zoneid_space;
 304  304  
 305  305  /*
 306  306   * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 307  307   * kernel proper runs, and which manages all other zones.
 308  308   *
 309  309   * Although not declared as static, the variable "zone0" should not be used
 310  310   * except by code that needs to reference the global zone early on in boot,
 311  311   * before it is fully initialized.  All other consumers should use
 312  312   * 'global_zone'.
 313  313   */
 314  314  zone_t zone0;
 315  315  zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 316  316  
 317  317  /*
 318  318   * List of active zones, protected by zonehash_lock.
 319  319   */
 320  320  static list_t zone_active;
 321  321  
 322  322  /*
 323  323   * List of destroyed zones that still have outstanding cred references.
 324  324   * Used for debugging.  Uses a separate lock to avoid lock ordering
 325  325   * problems in zone_free.
 326  326   */
 327  327  static list_t zone_deathrow;
 328  328  static kmutex_t zone_deathrow_lock;
 329  329  
 330  330  /* number of zones is limited by virtual interface limit in IP */
 331  331  uint_t maxzones = 8192;
 332  332  
 333  333  /* Event channel used to send zone state change notifications */
 334  334  evchan_t *zone_event_chan;
 335  335  
 336  336  /*
 337  337   * This table holds the mapping from kernel zone states to
 338  338   * states visible in the state notification API.
 339  339   * The idea is that we only expose "obvious" states and
 340  340   * do not expose states which are just implementation details.
 341  341   */
 342  342  const char  *zone_status_table[] = {
 343  343          ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 344  344          ZONE_EVENT_INITIALIZED,         /* initialized */
 345  345          ZONE_EVENT_READY,               /* ready */
 346  346          ZONE_EVENT_READY,               /* booting */
 347  347          ZONE_EVENT_RUNNING,             /* running */
 348  348          ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 349  349          ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 350  350          ZONE_EVENT_SHUTTING_DOWN,       /* down */
 351  351          ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 352  352          ZONE_EVENT_UNINITIALIZED,       /* dead */
 353  353  };
 354  354  
 355  355  /*
 356  356   * This array contains the names of the subsystems listed in zone_ref_subsys_t
 357  357   * (see sys/zone.h).
 358  358   */
 359  359  static char *zone_ref_subsys_names[] = {
 360  360          "NFS",          /* ZONE_REF_NFS */
 361  361          "NFSv4",        /* ZONE_REF_NFSV4 */
 362  362          "SMBFS",        /* ZONE_REF_SMBFS */
 363  363          "MNTFS",        /* ZONE_REF_MNTFS */
 364  364          "LOFI",         /* ZONE_REF_LOFI */
 365  365          "VFS",          /* ZONE_REF_VFS */
 366  366          "IPC"           /* ZONE_REF_IPC */
 367  367  };
 368  368  
 369  369  /*
 370  370   * This isn't static so lint doesn't complain.
 371  371   */
 372  372  rctl_hndl_t rc_zone_cpu_shares;
 373  373  rctl_hndl_t rc_zone_locked_mem;
 374  374  rctl_hndl_t rc_zone_max_swap;
 375  375  rctl_hndl_t rc_zone_max_lofi;
 376  376  rctl_hndl_t rc_zone_cpu_cap;
 377  377  rctl_hndl_t rc_zone_nlwps;
 378  378  rctl_hndl_t rc_zone_nprocs;
 379  379  rctl_hndl_t rc_zone_shmmax;
 380  380  rctl_hndl_t rc_zone_shmmni;
 381  381  rctl_hndl_t rc_zone_semmni;
 382  382  rctl_hndl_t rc_zone_msgmni;
 383  383  
 384  384  const char * const zone_default_initname = "/sbin/init";
 385  385  static char * const zone_prefix = "/zone/";
 386  386  static int zone_shutdown(zoneid_t zoneid);
 387  387  static int zone_add_datalink(zoneid_t, datalink_id_t);
 388  388  static int zone_remove_datalink(zoneid_t, datalink_id_t);
 389  389  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 390  390  static int zone_set_network(zoneid_t, zone_net_data_t *);
 391  391  static int zone_get_network(zoneid_t, zone_net_data_t *);
 392  392  
 393  393  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 394  394  
 395  395  static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 396  396  static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 397  397  static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 398  398  static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 399  399      zone_key_t);
 400  400  static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 401  401  static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 402  402      kmutex_t *);
 403  403  static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 404  404      kmutex_t *);
 405  405  
 406  406  /*
 407  407   * Bump this number when you alter the zone syscall interfaces; this is
 408  408   * because we need to have support for previous API versions in libc
 409  409   * to support patching; libc calls into the kernel to determine this number.
 410  410   *
 411  411   * Version 1 of the API is the version originally shipped with Solaris 10
 412  412   * Version 2 alters the zone_create system call in order to support more
 413  413   *     arguments by moving the args into a structure; and to do better
 414  414   *     error reporting when zone_create() fails.
 415  415   * Version 3 alters the zone_create system call in order to support the
 416  416   *     import of ZFS datasets to zones.
 417  417   * Version 4 alters the zone_create system call in order to support
 418  418   *     Trusted Extensions.
 419  419   * Version 5 alters the zone_boot system call, and converts its old
 420  420   *     bootargs parameter to be set by the zone_setattr API instead.
 421  421   * Version 6 adds the flag argument to zone_create.
 422  422   */
 423  423  static const int ZONE_SYSCALL_API_VERSION = 6;
 424  424  
 425  425  /*
 426  426   * Certain filesystems (such as NFS and autofs) need to know which zone
 427  427   * the mount is being placed in.  Because of this, we need to be able to
 428  428   * ensure that a zone isn't in the process of being created/destroyed such
 429  429   * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 430  430   * it gets added to the list of mounted zones, it ends up on the wrong zone's
 431  431   * mount list. Since a zone can't reside on an NFS file system, we don't
 432  432   * have to worry about the zonepath itself.
 433  433   *
 434  434   * The following functions: block_mounts()/resume_mounts() and
 435  435   * mount_in_progress()/mount_completed() are used by zones and the VFS
 436  436   * layer (respectively) to synchronize zone state transitions and new
 437  437   * mounts within a zone. This synchronization is on a per-zone basis, so
 438  438   * activity for one zone will not interfere with activity for another zone.
 439  439   *
 440  440   * The semantics are like a reader-reader lock such that there may
 441  441   * either be multiple mounts (or zone state transitions, if that weren't
 442  442   * serialized by zonehash_lock) in progress at the same time, but not
 443  443   * both.
 444  444   *
 445  445   * We use cv's so the user can ctrl-C out of the operation if it's
 446  446   * taking too long.
 447  447   *
 448  448   * The semantics are such that there is unfair bias towards the
 449  449   * "current" operation.  This means that zone halt may starve if
 450  450   * there is a rapid succession of new mounts coming in to the zone.
 451  451   */
 452  452  /*
 453  453   * Prevent new mounts from progressing to the point of calling
 454  454   * VFS_MOUNT().  If there are already mounts in this "region", wait for
 455  455   * them to complete.
 456  456   */
 457  457  static int
 458  458  block_mounts(zone_t *zp)
 459  459  {
 460  460          int retval = 0;         /* stays 0 on the "signaled" exit; 1 once mounts are blocked */
 461  461  
 462  462          /*
 463  463           * Since it may block for a long time, block_mounts() shouldn't be
 464  464           * called with zonehash_lock held.
 465  465           */
 466  466          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 467  467          mutex_enter(&zp->zone_mount_lock);
 468  468          while (zp->zone_mounts_in_progress > 0) {       /* mounts still in flight */
 469  469                  if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 470  470                          goto signaled;  /* bail out without blocking mounts; caller sees 0 */
 471  471          }
 472  472          /*
 473  473           * A negative value of mounts_in_progress indicates that mounts
 474  474           * have been blocked by (-mounts_in_progress) different callers
 475  475           * (remotely possible if two threads enter zone_shutdown at the same
 476  476           * time).
 477  477           */
 478  478          zp->zone_mounts_in_progress--;  /* register this caller as a blocker */
 479  479          retval = 1;
 480  480  signaled:
 481  481          mutex_exit(&zp->zone_mount_lock);
 482  482          return (retval);
 483  483  }
 484  484  
 485  485  /*
 486  486   * The VFS layer may progress with new mounts as far as we're concerned.
 487  487   * Allow them to progress if we were the last obstacle.
 488  488   */
 489  489  static void
 490  490  resume_mounts(zone_t *zp)
 491  491  {
 492  492          mutex_enter(&zp->zone_mount_lock);
 493  493          if (++zp->zone_mounts_in_progress == 0) /* undo one block_mounts(); 0 means no blockers remain */
 494  494                  cv_broadcast(&zp->zone_mount_cv);       /* wake threads waiting in mount_in_progress() */
 495  495          mutex_exit(&zp->zone_mount_lock);
 496  496  }
 497  497  
 498  498  /*
 499  499   * The VFS layer is busy with a mount; this zone should wait until all
 500  500   * of its mounts are completed to progress.
 501  501   */
 502  502  void
 503  503  mount_in_progress(zone_t *zp)
 504  504  {
 505  505          mutex_enter(&zp->zone_mount_lock);
 506  506          while (zp->zone_mounts_in_progress < 0) /* negative count: mounts are currently blocked */
 507  507                  cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 508  508          zp->zone_mounts_in_progress++;  /* count this mount as in flight */
 509  509          mutex_exit(&zp->zone_mount_lock);
 510  510  }
 511  511  
 512  512  /*
 513  513   * VFS is done with one mount; wake up any waiting block_mounts()
 514  514   * callers if this is the last mount.
 515  515   */
 516  516  void
 517  517  mount_completed(zone_t *zp)
 518  518  {
 519  519          mutex_enter(&zp->zone_mount_lock);
 520  520          if (--zp->zone_mounts_in_progress == 0) /* that was the last in-flight mount */
 521  521                  cv_broadcast(&zp->zone_mount_cv);       /* let waiting block_mounts() callers proceed */
 522  522          mutex_exit(&zp->zone_mount_lock);
 523  523  }
 524  524  
 525  525  /*
 526  526   * ZSD routines.
 527  527   *
 528  528   * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 529  529   * defined by the pthread_key_create() and related interfaces.
 530  530   *
 531  531   * Kernel subsystems may register one or more data items and/or
 532  532   * callbacks to be executed when a zone is created, shutdown, or
 533  533   * destroyed.
 534  534   *
 535  535   * Unlike the thread counterpart, destructor callbacks will be executed
 536  536   * even if the data pointer is NULL and/or there are no constructor
 537  537   * callbacks, so it is the responsibility of such callbacks to check for
 538  538   * NULL data values if necessary.
 539  539   *
 540  540   * The locking strategy and overall picture is as follows:
 541  541   *
 542  542   * When someone calls zone_key_create(), a template ZSD entry is added to the
 543  543   * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 544  544   * holding that lock all the existing zones are marked as
 545  545   * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 546  546   * zone_zsd list (protected by zone_lock). The global list is updated first
 547  547   * (under zsd_key_lock) to make sure that newly created zones use the
 548  548   * most recent list of keys. Then under zonehash_lock we walk the zones
 549  549   * and mark them.  Similar locking is used in zone_key_delete().
 550  550   *
 551  551   * The actual create, shutdown, and destroy callbacks are done without
 552  552   * holding any lock. And zsd_flags are used to ensure that the operations
 553  553   * completed so that when zone_key_create (and zone_create) is done, as well as
 554  554   * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 555  555   * are completed.
 556  556   *
 557  557   * When new zones are created constructor callbacks for all registered ZSD
 558  558   * entries will be called. That also uses the above two phases of marking
 559  559   * what needs to be done, and then running the callbacks without holding
 560  560   * any locks.
 561  561   *
 562  562   * The framework does not provide any locking around zone_getspecific() and
 563  563   * zone_setspecific() apart from that needed for internal consistency, so
 564  564   * callers interested in atomic "test-and-set" semantics will need to provide
 565  565   * their own locking.
 566  566   */
 567  567  
 568  568  /*
 569  569   * Helper function to find the zsd_entry associated with the key in the
 570  570   * given list.
 571  571   */
 572  572  static struct zsd_entry *
 573  573  zsd_find(list_t *l, zone_key_t key)
 574  574  {
 575  575          struct zsd_entry *zsd;
 576  576  
 577  577          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 578  578                  if (zsd->zsd_key == key) {
 579  579                          return (zsd);   /* first entry whose key matches */
 580  580                  }
 581  581          }
 582  582          return (NULL);  /* no entry with this key in the list */
 583  583  }
 584  584  
 585  585  /*
 586  586   * Helper function to find the zsd_entry associated with the key in the
 587  587   * given list. Move it to the front of the list.
 588  588   */
 589  589  static struct zsd_entry *
 590  590  zsd_find_mru(list_t *l, zone_key_t key)
 591  591  {
 592  592          struct zsd_entry *zsd;
 593  593  
 594  594          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 595  595                  if (zsd->zsd_key == key) {
 596  596                          /*
 597  597                           * Move to head of list to keep list in MRU order.
 598  598                           */
 599  599                          if (zsd != list_head(l)) {      /* already at the head: nothing to move */
 600  600                                  list_remove(l, zsd);
 601  601                                  list_insert_head(l, zsd);
 602  602                          }
 603  603                          return (zsd);   /* matching entry, now at the head of the list */
 604  604                  }
 605  605          }
 606  606          return (NULL);  /* no entry with this key; list left unchanged */
 607  607  }
 608  608  
           /*
            * Register a new zone-specific data (ZSD) key and its callbacks.
            *
            *   keyp     - out: receives the new key.  It is written last, after
            *              the create callback (if any) has been applied to all
            *              zones.
            *   create   - per-zone constructor, or NULL.
            *   shutdown - per-zone shutdown callback, or NULL.
            *   destroy  - per-zone destructor, or NULL.
            *
            * The key is first placed on zsd_registered_keys, then an entry is
            * added to each zone on zone_active that is neither uninitialized nor
            * already down, and finally zsd_apply_create is run across all zones.
            * All allocations are KM_SLEEP.
            */
 609  609  void
 610  610  zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 611  611      void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 612  612  {
 613  613          struct zsd_entry *zsdp;
 614  614          struct zsd_entry *t;
 615  615          struct zone *zone;
 616  616          zone_key_t  key;
 617  617  
                   /* zsdp is the template entry kept on the global key list. */
 618  618          zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 619  619          zsdp->zsd_data = NULL;
 620  620          zsdp->zsd_create = create;
 621  621          zsdp->zsd_shutdown = shutdown;
 622  622          zsdp->zsd_destroy = destroy;
 623  623  
 624  624          /*
 625  625           * Insert in global list of callbacks. Makes future zone creations
 626  626           * see it.
 627  627           */
 628  628          mutex_enter(&zsd_key_lock);
                   /*
                    * Keys come from pre-incrementing zsd_keyval; the ASSERT checks
                    * that the counter has not wrapped around to 0.
                    */
 629  629          key = zsdp->zsd_key = ++zsd_keyval;
 630  630          ASSERT(zsd_keyval != 0);
 631  631          list_insert_tail(&zsd_registered_keys, zsdp);
 632  632          mutex_exit(&zsd_key_lock);
 633  633  
 634  634          /*
 635  635           * Insert for all existing zones and mark them as needing
 636  636           * a create callback.
 637  637           */
 638  638          mutex_enter(&zonehash_lock);    /* stop the world */
 639  639          for (zone = list_head(&zone_active); zone != NULL;
 640  640              zone = list_next(&zone_active, zone)) {
 641  641                  zone_status_t status;
 642  642  
 643  643                  mutex_enter(&zone->zone_lock);
 644  644  
 645  645                  /* Skip zones that are on the way down or not yet up */
 646  646                  status = zone_status_get(zone);
 647  647                  if (status >= ZONE_IS_DOWN ||
 648  648                      status == ZONE_IS_UNINITIALIZED) {
 649  649                          mutex_exit(&zone->zone_lock);
 650  650                          continue;
 651  651                  }
 652  652  
 653  653                  t = zsd_find_mru(&zone->zone_zsd, key);
 654  654                  if (t != NULL) {
 655  655                          /*
 656  656                           * A zsd_configure already inserted it after
 657  657                           * we dropped zsd_key_lock above.
 658  658                           */
 659  659                          mutex_exit(&zone->zone_lock);
 660  660                          continue;
 661  661                  }
                           /* Build this zone's own copy of the callbacks. */
 662  662                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 663  663                  t->zsd_key = key;
 664  664                  t->zsd_create = create;
 665  665                  t->zsd_shutdown = shutdown;
 666  666                  t->zsd_destroy = destroy;
                           /* Only request a create pass if there is a constructor. */
 667  667                  if (create != NULL) {
 668  668                          t->zsd_flags = ZSD_CREATE_NEEDED;
 669  669                          DTRACE_PROBE2(zsd__create__needed,
 670  670                              zone_t *, zone, zone_key_t, key);
 671  671                  }
 672  672                  list_insert_tail(&zone->zone_zsd, t);
 673  673                  mutex_exit(&zone->zone_lock);
 674  674          }
 675  675          mutex_exit(&zonehash_lock);
 676  676  
 677  677          if (create != NULL) {
 678  678                  /* Now call the create callback for this key */
 679  679                  zsd_apply_all_zones(zsd_apply_create, key);
 680  680          }
 681  681          /*
 682  682           * It is safe for consumers to use the key now, make it
 683  683           * globally visible. Specifically zone_getspecific() will
 684  684           * always successfully return the zone specific data associated
 685  685           * with the key.
 686  686           */
 687  687          *keyp = key;
 688  688  
 689  689  }
 690  690  
 691  691  /*
 692  692   * Function called when a module is being unloaded, or otherwise wishes
 693  693   * to unregister its ZSD key and callbacks.
 694  694   *
 695  695   * Remove from the global list and determine the functions that need to
 696  696   * be called under a global lock. Then call the functions without
 697  697   * holding any locks. Finally free up the zone_zsd entries. (The apply
 698  698   * functions need to access the zone_zsd entries to find zsd_data etc.)
            *
            * Returns -1 if the key is not on zsd_registered_keys, and 0 once the
            * callbacks have been applied and the per-zone entries freed.
 699  699   */
 700  700  int
 701  701  zone_key_delete(zone_key_t key)
 702  702  {
 703  703          struct zsd_entry *zsdp = NULL;
 704  704          zone_t *zone;
 705  705  
                   /* Take the template entry off the global list of keys. */
 706  706          mutex_enter(&zsd_key_lock);
 707  707          zsdp = zsd_find_mru(&zsd_registered_keys, key);
 708  708          if (zsdp == NULL) {
 709  709                  mutex_exit(&zsd_key_lock);
 710  710                  return (-1);
 711  711          }
 712  712          list_remove(&zsd_registered_keys, zsdp);
 713  713          mutex_exit(&zsd_key_lock);
 714  714  
                   /*
                    * First pass: flag each zone's entry with the shutdown and
                    * destroy work still owed.  Nothing is called here.
                    * NOTE(review): ZSD_SHUTDOWN_ALL and ZSD_DESTROY_ALL are
                    * presumably masks of every shutdown/destroy state bit, so an
                    * entry that has already been flagged or processed is left
                    * alone - confirm against their definitions.
                    */
 715  715          mutex_enter(&zonehash_lock);
 716  716          for (zone = list_head(&zone_active); zone != NULL;
 717  717              zone = list_next(&zone_active, zone)) {
 718  718                  struct zsd_entry *del;
 719  719  
 720  720                  mutex_enter(&zone->zone_lock);
 721  721                  del = zsd_find_mru(&zone->zone_zsd, key);
 722  722                  if (del == NULL) {
 723  723                          /*
 724  724                           * Somebody else got here first e.g the zone going
 725  725                           * away.
 726  726                           */
 727  727                          mutex_exit(&zone->zone_lock);
 728  728                          continue;
 729  729                  }
 730  730                  ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 731  731                  ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 732  732                  if (del->zsd_shutdown != NULL &&
 733  733                      (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 734  734                          del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 735  735                          DTRACE_PROBE2(zsd__shutdown__needed,
 736  736                              zone_t *, zone, zone_key_t, key);
 737  737                  }
 738  738                  if (del->zsd_destroy != NULL &&
 739  739                      (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 740  740                          del->zsd_flags |= ZSD_DESTROY_NEEDED;
 741  741                          DTRACE_PROBE2(zsd__destroy__needed,
 742  742                              zone_t *, zone, zone_key_t, key);
 743  743                  }
 744  744                  mutex_exit(&zone->zone_lock);
 745  745          }
 746  746          mutex_exit(&zonehash_lock);
                   /*
                    * The template can go now: the apply functions below use the
                    * callback pointers stored in each zone's own entry.
                    */
 747  747          kmem_free(zsdp, sizeof (*zsdp));
 748  748  
 749  749          /* Now call the shutdown and destroy callback for this key */
 750  750          zsd_apply_all_zones(zsd_apply_shutdown, key);
 751  751          zsd_apply_all_zones(zsd_apply_destroy, key);
 752  752  
 753  753          /* Now we can free up the zsdp structures in each zone */
 754  754          mutex_enter(&zonehash_lock);
 755  755          for (zone = list_head(&zone_active); zone != NULL;
 756  756              zone = list_next(&zone_active, zone)) {
 757  757                  struct zsd_entry *del;
 758  758  
 759  759                  mutex_enter(&zone->zone_lock);
 760  760                  del = zsd_find(&zone->zone_zsd, key);
 761  761                  if (del != NULL) {
 762  762                          list_remove(&zone->zone_zsd, del);
                                   /* No callback may still be running on this entry. */
 763  763                          ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 764  764                          kmem_free(del, sizeof (*del));
 765  765                  }
 766  766                  mutex_exit(&zone->zone_lock);
 767  767          }
 768  768          mutex_exit(&zonehash_lock);
 769  769  
 770  770          return (0);
 771  771  }
 772  772  
 773  773  /*
 774  774   * ZSD counterpart of pthread_setspecific().
 775  775   *
 776  776   * Since all zsd callbacks, including those with no create function,
 777  777   * have an entry in zone_zsd, if the key is registered it is part of
 778  778   * the zone_zsd list.
 779  779   * Return an error if the key wasn't registerd.
 780  780   */
 781  781  int
 782  782  zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 783  783  {
 784  784          struct zsd_entry *t;
 785  785  
 786  786          mutex_enter(&zone->zone_lock);
 787  787          t = zsd_find_mru(&zone->zone_zsd, key);
 788  788          if (t != NULL) {
 789  789                  /*
 790  790                   * Replace old value with new
 791  791                   */
 792  792                  t->zsd_data = (void *)data;
 793  793                  mutex_exit(&zone->zone_lock);
 794  794                  return (0);
 795  795          }
 796  796          mutex_exit(&zone->zone_lock);
 797  797          return (-1);
 798  798  }
 799  799  
 800  800  /*
 801  801   * ZSD counterpart of pthread_getspecific().
 802  802   */
 803  803  void *
 804  804  zone_getspecific(zone_key_t key, zone_t *zone)
 805  805  {
 806  806          struct zsd_entry *t;
 807  807          void *data;
 808  808  
 809  809          mutex_enter(&zone->zone_lock);
 810  810          t = zsd_find_mru(&zone->zone_zsd, key);
 811  811          data = (t == NULL ? NULL : t->zsd_data);
 812  812          mutex_exit(&zone->zone_lock);
 813  813          return (data);
 814  814  }
 815  815  
 816  816  /*
 817  817   * Function used to initialize a zone's list of ZSD callbacks and data
 818  818   * when the zone is being created.  The callbacks are initialized from
 819  819   * the template list (zsd_registered_keys). The constructor callback is
 820  820   * executed later (once the zone exists and with locks dropped).
 821  821   */
 822  822  static void
 823  823  zone_zsd_configure(zone_t *zone)
 824  824  {
 825  825          struct zsd_entry *zsdp;
 826  826          struct zsd_entry *t;
 827  827  
 828  828          ASSERT(MUTEX_HELD(&zonehash_lock));
 829  829          ASSERT(list_head(&zone->zone_zsd) == NULL);
 830  830          mutex_enter(&zone->zone_lock);
 831  831          mutex_enter(&zsd_key_lock);
 832  832          for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 833  833              zsdp = list_next(&zsd_registered_keys, zsdp)) {
 834  834                  /*
 835  835                   * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 836  836                   * should not have added anything to it.
 837  837                   */
 838  838                  ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 839  839  
 840  840                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 841  841                  t->zsd_key = zsdp->zsd_key;
 842  842                  t->zsd_create = zsdp->zsd_create;
 843  843                  t->zsd_shutdown = zsdp->zsd_shutdown;
 844  844                  t->zsd_destroy = zsdp->zsd_destroy;
 845  845                  if (zsdp->zsd_create != NULL) {
 846  846                          t->zsd_flags = ZSD_CREATE_NEEDED;
 847  847                          DTRACE_PROBE2(zsd__create__needed,
 848  848                              zone_t *, zone, zone_key_t, zsdp->zsd_key);
 849  849                  }
 850  850                  list_insert_tail(&zone->zone_zsd, t);
 851  851          }
 852  852          mutex_exit(&zsd_key_lock);
 853  853          mutex_exit(&zone->zone_lock);
 854  854  }
 855  855  
           /*
            * Names the kind of ZSD callback being requested.
            * zone_zsd_callbacks() accepts only ZSD_SHUTDOWN and ZSD_DESTROY.
            */
 856  856  enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 857  857  
 858  858  /*
 859  859   * Helper function to execute shutdown or destructor callbacks.
            *
            * ct selects which kind of work is flagged on the zone's entries
            * (ZSD_SHUTDOWN or ZSD_DESTROY; ZSD_CREATE is not accepted).  The zone
            * must be at least ZONE_IS_EMPTY for shutdown and at least ZONE_IS_DOWN
            * for destroy.
 860  860   */
 861  861  static void
 862  862  zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 863  863  {
 864  864          struct zsd_entry *t;
 865  865  
 866  866          ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 867  867          ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 868  868          ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 869  869  
 870  870          /*
 871  871           * Run the callback solely based on what is registered for the zone
 872  872           * in zone_zsd. The global list can change independently of this
 873  873           * as keys are registered and unregistered and we don't register new
 874  874           * callbacks for a zone that is in the process of going away.
 875  875           */
 876  876          mutex_enter(&zone->zone_lock);
 877  877          for (t = list_head(&zone->zone_zsd); t != NULL;
 878  878              t = list_next(&zone->zone_zsd, t)) {
 879  879                  zone_key_t key = t->zsd_key;
 880  880  
 881  881                  /* Flag only entries that have a callback of this kind */
 882  882  
 883  883                  if (ct == ZSD_SHUTDOWN) {
 884  884                          if (t->zsd_shutdown != NULL &&
 885  885                              (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 886  886                                  t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 887  887                                  DTRACE_PROBE2(zsd__shutdown__needed,
 888  888                                      zone_t *, zone, zone_key_t, key);
 889  889                          }
 890  890                  } else {
 891  891                          if (t->zsd_destroy != NULL &&
 892  892                              (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 893  893                                  t->zsd_flags |= ZSD_DESTROY_NEEDED;
 894  894                                  DTRACE_PROBE2(zsd__destroy__needed,
 895  895                                      zone_t *, zone, zone_key_t, key);
 896  896                          }
 897  897                  }
 898  898          }
 899  899          mutex_exit(&zone->zone_lock);
 900  900  
 901  901          /* Now call the shutdown and destroy callbacks for every key */
                   /*
                    * Both passes run whatever ct is; each apply function only acts
                    * on entries whose matching *_NEEDED flag is set.
                    */
 902  902          zsd_apply_all_keys(zsd_apply_shutdown, zone);
 903  903          zsd_apply_all_keys(zsd_apply_destroy, zone);
 904  904  
 905  905  }
 906  906  
 907  907  /*
 908  908   * Called when the zone is going away; free ZSD-related memory, and
 909  909   * destroy the zone_zsd list.
 910  910   */
 911  911  static void
 912  912  zone_free_zsd(zone_t *zone)
 913  913  {
 914  914          struct zsd_entry *t, *next;
 915  915  
 916  916          /*
 917  917           * Free all the zsd_entry's we had on this zone.
 918  918           */
 919  919          mutex_enter(&zone->zone_lock);
 920  920          for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 921  921                  next = list_next(&zone->zone_zsd, t);
 922  922                  list_remove(&zone->zone_zsd, t);
 923  923                  ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 924  924                  kmem_free(t, sizeof (*t));
 925  925          }
 926  926          list_destroy(&zone->zone_zsd);
 927  927          mutex_exit(&zone->zone_lock);
 928  928  
 929  929  }
 930  930  
 931  931  /*
 932  932   * Apply a function to all zones for particular key value.
 933  933   *
 934  934   * The applyfn has to drop zonehash_lock if it does some work, and
 935  935   * then reacquire it before it returns.
 936  936   * When the lock is dropped we don't follow list_next even
 937  937   * if it is possible to do so without any hazards. This is
 938  938   * because we want the design to allow for the list of zones
 939  939   * to change in any arbitrary way during the time the
 940  940   * lock was dropped.
 941  941   *
 942  942   * It is safe to restart the loop at list_head since the applyfn
 943  943   * changes the zsd_flags as it does work, so a subsequent
 944  944   * pass through will have no effect in applyfn, hence the loop will terminate
 945  945   * in at worst O(N^2).
 946  946   */
 947  947  static void
 948  948  zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 949  949  {
 950  950          zone_t *zone;
 951  951  
 952  952          mutex_enter(&zonehash_lock);
 953  953          zone = list_head(&zone_active);
 954  954          while (zone != NULL) {
 955  955                  if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 956  956                          /* Lock dropped - restart at head */
 957  957                          zone = list_head(&zone_active);
 958  958                  } else {
 959  959                          zone = list_next(&zone_active, zone);
 960  960                  }
 961  961          }
 962  962          mutex_exit(&zonehash_lock);
 963  963  }
 964  964  
 965  965  /*
 966  966   * Apply a function to all keys for a particular zone.
 967  967   *
 968  968   * The applyfn has to drop zonehash_lock if it does some work, and
 969  969   * then reacquire it before it returns.
 970  970   * When the lock is dropped we don't follow list_next even
 971  971   * if it is possible to do so without any hazards. This is
 972  972   * because we want the design to allow for the list of zsd callbacks
 973  973   * to change in any arbitrary way during the time the
 974  974   * lock was dropped.
 975  975   *
 976  976   * It is safe to restart the loop at list_head since the applyfn
 977  977   * changes the zsd_flags as it does work, so a subsequent
 978  978   * pass through will have no effect in applyfn, hence the loop will terminate
 979  979   * in at worst O(N^2).
 980  980   */
 981  981  static void
 982  982  zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 983  983  {
 984  984          struct zsd_entry *t;
 985  985  
 986  986          mutex_enter(&zone->zone_lock);
 987  987          t = list_head(&zone->zone_zsd);
 988  988          while (t != NULL) {
 989  989                  if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 990  990                          /* Lock dropped - restart at head */
 991  991                          t = list_head(&zone->zone_zsd);
 992  992                  } else {
 993  993                          t = list_next(&zone->zone_zsd, t);
 994  994                  }
 995  995          }
 996  996          mutex_exit(&zone->zone_lock);
 997  997  }
 998  998  
 999  999  /*
1000 1000   * Call the create function for the zone and key if CREATE_NEEDED
1001 1001   * is set.
1002 1002   * If some other thread gets here first and sets CREATE_INPROGRESS, then
1003 1003   * we wait for that thread to complete so that we can ensure that
1004 1004   * all the callbacks are done when we've looped over all zones/keys.
1005 1005   *
1006 1006   * When we call the create function, we drop the global held by the
1007 1007   * caller, and return true to tell the caller it needs to re-evalute the
1008 1008   * state.
1009 1009   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1010 1010   * remains held on exit.
1011 1011   */
1012 1012  static boolean_t
1013 1013  zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1014 1014      zone_t *zone, zone_key_t key)
1015 1015  {
1016 1016          void *result;
1017 1017          struct zsd_entry *t;
1018 1018          boolean_t dropped;
1019 1019  
1020 1020          if (lockp != NULL) {
1021 1021                  ASSERT(MUTEX_HELD(lockp));
1022 1022          }
1023 1023          if (zone_lock_held) {
1024 1024                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1025 1025          } else {
1026 1026                  mutex_enter(&zone->zone_lock);
1027 1027          }
1028 1028  
1029 1029          t = zsd_find(&zone->zone_zsd, key);
1030 1030          if (t == NULL) {
1031 1031                  /*
1032 1032                   * Somebody else got here first e.g the zone going
1033 1033                   * away.
1034 1034                   */
1035 1035                  if (!zone_lock_held)
1036 1036                          mutex_exit(&zone->zone_lock);
1037 1037                  return (B_FALSE);
1038 1038          }
1039 1039          dropped = B_FALSE;
1040 1040          if (zsd_wait_for_inprogress(zone, t, lockp))
1041 1041                  dropped = B_TRUE;
1042 1042  
1043 1043          if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1044 1044                  t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1045 1045                  t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1046 1046                  DTRACE_PROBE2(zsd__create__inprogress,
1047 1047                      zone_t *, zone, zone_key_t, key);
1048 1048                  mutex_exit(&zone->zone_lock);
1049 1049                  if (lockp != NULL)
1050 1050                          mutex_exit(lockp);
1051 1051  
1052 1052                  dropped = B_TRUE;
1053 1053                  ASSERT(t->zsd_create != NULL);
1054 1054                  DTRACE_PROBE2(zsd__create__start,
1055 1055                      zone_t *, zone, zone_key_t, key);
1056 1056  
1057 1057                  result = (*t->zsd_create)(zone->zone_id);
1058 1058  
1059 1059                  DTRACE_PROBE2(zsd__create__end,
1060 1060                      zone_t *, zone, voidn *, result);
1061 1061  
1062 1062                  ASSERT(result != NULL);
1063 1063                  if (lockp != NULL)
1064 1064                          mutex_enter(lockp);
1065 1065                  mutex_enter(&zone->zone_lock);
1066 1066                  t->zsd_data = result;
1067 1067                  t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1068 1068                  t->zsd_flags |= ZSD_CREATE_COMPLETED;
1069 1069                  cv_broadcast(&t->zsd_cv);
1070 1070                  DTRACE_PROBE2(zsd__create__completed,
1071 1071                      zone_t *, zone, zone_key_t, key);
1072 1072          }
1073 1073          if (!zone_lock_held)
1074 1074                  mutex_exit(&zone->zone_lock);
1075 1075          return (dropped);
1076 1076  }
1077 1077  
1078 1078  /*
1079 1079   * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1080 1080   * is set.
1081 1081   * If some other thread gets here first and sets *_INPROGRESS, then
1082 1082   * we wait for that thread to complete so that we can ensure that
1083 1083   * all the callbacks are done when we've looped over all zones/keys.
1084 1084   *
1085 1085   * When we call the shutdown function, we drop the lock held by the
1086 1086   * caller (lockp), and return true to tell the caller it needs to
            * re-evaluate the
1087 1087   * state.
1088 1088   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1089 1089   * remains held on exit.
            *
            * Returns B_FALSE when the zone has no entry for the key or when no
            * lock was dropped.
1090 1090   */
1091 1091  static boolean_t
1092 1092  zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1093 1093      zone_t *zone, zone_key_t key)
1094 1094  {
1095 1095          struct zsd_entry *t;
1096 1096          void *data;
1097 1097          boolean_t dropped;
1098 1098  
1099 1099          if (lockp != NULL) {
1100 1100                  ASSERT(MUTEX_HELD(lockp));
1101 1101          }
1102 1102          if (zone_lock_held) {
1103 1103                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1104 1104          } else {
1105 1105                  mutex_enter(&zone->zone_lock);
1106 1106          }
1107 1107  
1108 1108          t = zsd_find(&zone->zone_zsd, key);
1109 1109          if (t == NULL) {
1110 1110                  /*
1111 1111                   * Somebody else got here first e.g the zone going
1112 1112                   * away.
1113 1113                   */
1114 1114                  if (!zone_lock_held)
1115 1115                          mutex_exit(&zone->zone_lock);
1116 1116                  return (B_FALSE);
1117 1117          }
1118 1118          dropped = B_FALSE;
                   /* A create that is still pending must be dealt with first. */
1119 1119          if (zsd_wait_for_creator(zone, t, lockp))
1120 1120                  dropped = B_TRUE;
1121 1121  
                   /* Then wait out any callback currently running on this entry. */
1122 1122          if (zsd_wait_for_inprogress(zone, t, lockp))
1123 1123                  dropped = B_TRUE;
1124 1124  
1125 1125          if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
                           /*
                            * Claim the work (NEEDED -> INPROGRESS) before the
                            * locks are released, so other threads wait instead of
                            * calling the callback again.
                            */
1126 1126                  t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1127 1127                  t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1128 1128                  DTRACE_PROBE2(zsd__shutdown__inprogress,
1129 1129                      zone_t *, zone, zone_key_t, key);
1130 1130                  mutex_exit(&zone->zone_lock);
1131 1131                  if (lockp != NULL)
1132 1132                          mutex_exit(lockp);
1133 1133                  dropped = B_TRUE;
1134 1134  
1135 1135                  ASSERT(t->zsd_shutdown != NULL);
                           /*
                            * NOTE(review): zsd_data is read after zone_lock has
                            * been released; presumably the INPROGRESS flag is
                            * what keeps the entry itself from being freed -
                            * confirm.
                            */
1136 1136                  data = t->zsd_data;
1137 1137  
1138 1138                  DTRACE_PROBE2(zsd__shutdown__start,
1139 1139                      zone_t *, zone, zone_key_t, key);
1140 1140  
1141 1141                  (t->zsd_shutdown)(zone->zone_id, data);
1142 1142                  DTRACE_PROBE2(zsd__shutdown__end,
1143 1143                      zone_t *, zone, zone_key_t, key);
1144 1144  
                           /* Retake lockp before zone_lock to preserve lock order. */
1145 1145                  if (lockp != NULL)
1146 1146                          mutex_enter(lockp);
1147 1147                  mutex_enter(&zone->zone_lock);
1148 1148                  t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1149 1149                  t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
                           /* Wake every thread waiting on this entry. */
1150 1150                  cv_broadcast(&t->zsd_cv);
1151 1151                  DTRACE_PROBE2(zsd__shutdown__completed,
1152 1152                      zone_t *, zone, zone_key_t, key);
1153 1153          }
1154 1154          if (!zone_lock_held)
1155 1155                  mutex_exit(&zone->zone_lock);
1156 1156          return (dropped);
1157 1157  }
1158 1158  
1159 1159  /*
1160 1160   * Call the destroy function for the zone and key if DESTROY_NEEDED
1161 1161   * is set.
1162 1162   * If some other thread gets here first and sets *_INPROGRESS, then
1163 1163   * we wait for that thread to complete so that we can ensure that
1164 1164   * all the callbacks are done when we've looped over all zones/keys.
1165 1165   *
1166 1166   * When we call the destroy function, we drop the lock held by the
1167 1167   * caller (lockp), and return true to tell the caller it needs to
            * re-evaluate the
1168 1168   * state.
1169 1169   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1170 1170   * remains held on exit.
            *
            * Returns B_FALSE when the zone has no entry for the key or when no
            * lock was dropped.  After the callback returns, the entry's zsd_data
            * is reset to NULL.
1171 1171   */
1172 1172  static boolean_t
1173 1173  zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1174 1174      zone_t *zone, zone_key_t key)
1175 1175  {
1176 1176          struct zsd_entry *t;
1177 1177          void *data;
1178 1178          boolean_t dropped;
1179 1179  
1180 1180          if (lockp != NULL) {
1181 1181                  ASSERT(MUTEX_HELD(lockp));
1182 1182          }
1183 1183          if (zone_lock_held) {
1184 1184                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1185 1185          } else {
1186 1186                  mutex_enter(&zone->zone_lock);
1187 1187          }
1188 1188  
1189 1189          t = zsd_find(&zone->zone_zsd, key);
1190 1190          if (t == NULL) {
1191 1191                  /*
1192 1192                   * Somebody else got here first e.g the zone going
1193 1193                   * away.
1194 1194                   */
1195 1195                  if (!zone_lock_held)
1196 1196                          mutex_exit(&zone->zone_lock);
1197 1197                  return (B_FALSE);
1198 1198          }
1199 1199          dropped = B_FALSE;
                   /* A create that is still pending must be dealt with first. */
1200 1200          if (zsd_wait_for_creator(zone, t, lockp))
1201 1201                  dropped = B_TRUE;
1202 1202  
                   /* Then wait out any callback currently running on this entry. */
1203 1203          if (zsd_wait_for_inprogress(zone, t, lockp))
1204 1204                  dropped = B_TRUE;
1205 1205  
1206 1206          if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
                           /*
                            * Claim the work (NEEDED -> INPROGRESS) before the
                            * locks are released, so other threads wait instead of
                            * calling the callback again.
                            */
1207 1207                  t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1208 1208                  t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1209 1209                  DTRACE_PROBE2(zsd__destroy__inprogress,
1210 1210                      zone_t *, zone, zone_key_t, key);
1211 1211                  mutex_exit(&zone->zone_lock);
1212 1212                  if (lockp != NULL)
1213 1213                          mutex_exit(lockp);
1214 1214                  dropped = B_TRUE;
1215 1215  
1216 1216                  ASSERT(t->zsd_destroy != NULL);
                           /*
                            * NOTE(review): zsd_data is read after zone_lock has
                            * been released; presumably the INPROGRESS flag is
                            * what keeps the entry itself from being freed -
                            * confirm.
                            */
1217 1217                  data = t->zsd_data;
1218 1218                  DTRACE_PROBE2(zsd__destroy__start,
1219 1219                      zone_t *, zone, zone_key_t, key);
1220 1220  
1221 1221                  (t->zsd_destroy)(zone->zone_id, data);
1222 1222                  DTRACE_PROBE2(zsd__destroy__end,
1223 1223                      zone_t *, zone, zone_key_t, key);
1224 1224  
                           /* Retake lockp before zone_lock to preserve lock order. */
1225 1225                  if (lockp != NULL)
1226 1226                          mutex_enter(lockp);
1227 1227                  mutex_enter(&zone->zone_lock);
                           /* The data has been handed to the destructor; drop it. */
1228 1228                  t->zsd_data = NULL;
1229 1229                  t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1230 1230                  t->zsd_flags |= ZSD_DESTROY_COMPLETED;
                           /* Wake every thread waiting on this entry. */
1231 1231                  cv_broadcast(&t->zsd_cv);
1232 1232                  DTRACE_PROBE2(zsd__destroy__completed,
1233 1233                      zone_t *, zone, zone_key_t, key);
1234 1234          }
1235 1235          if (!zone_lock_held)
1236 1236                  mutex_exit(&zone->zone_lock);
1237 1237          return (dropped);
1238 1238  }
1239 1239  
1240 1240  /*
1241 1241   * Wait for any CREATE_NEEDED flag to be cleared.
1242 1242   * Returns true if lockp was temporarily dropped while waiting.
1243 1243   */
1244 1244  static boolean_t
1245 1245  zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1246 1246  {
1247 1247          boolean_t dropped = B_FALSE;
1248 1248  
1249 1249          while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1250 1250                  DTRACE_PROBE2(zsd__wait__for__creator,
1251 1251                      zone_t *, zone, struct zsd_entry *, t);
1252 1252                  if (lockp != NULL) {
1253 1253                          dropped = B_TRUE;
1254 1254                          mutex_exit(lockp);
1255 1255                  }
1256 1256                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1257 1257                  if (lockp != NULL) {
1258 1258                          /* First drop zone_lock to preserve order */
1259 1259                          mutex_exit(&zone->zone_lock);
1260 1260                          mutex_enter(lockp);
1261 1261                          mutex_enter(&zone->zone_lock);
1262 1262                  }
1263 1263          }
1264 1264          return (dropped);
1265 1265  }
1266 1266  
1267 1267  /*
1268 1268   * Wait for any INPROGRESS flag to be cleared.
1269 1269   * Returns true if lockp was temporarily dropped while waiting.
1270 1270   */
1271 1271  static boolean_t
1272 1272  zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1273 1273  {
1274 1274          boolean_t dropped = B_FALSE;
1275 1275  
1276 1276          while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1277 1277                  DTRACE_PROBE2(zsd__wait__for__inprogress,
1278 1278                      zone_t *, zone, struct zsd_entry *, t);
1279 1279                  if (lockp != NULL) {
1280 1280                          dropped = B_TRUE;
1281 1281                          mutex_exit(lockp);
1282 1282                  }
1283 1283                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1284 1284                  if (lockp != NULL) {
1285 1285                          /* First drop zone_lock to preserve order */
1286 1286                          mutex_exit(&zone->zone_lock);
1287 1287                          mutex_enter(lockp);
1288 1288                          mutex_enter(&zone->zone_lock);
1289 1289                  }
1290 1290          }
1291 1291          return (dropped);
1292 1292  }
1293 1293  
1294 1294  /*
1295 1295   * Frees memory associated with the zone dataset list.
1296 1296   */
1297 1297  static void
1298 1298  zone_free_datasets(zone_t *zone)
1299 1299  {
1300 1300          zone_dataset_t *t, *next;
1301 1301  
1302 1302          for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1303 1303                  next = list_next(&zone->zone_datasets, t);
1304 1304                  list_remove(&zone->zone_datasets, t);
1305 1305                  kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1306 1306                  kmem_free(t, sizeof (*t));
1307 1307          }
1308 1308          list_destroy(&zone->zone_datasets);
1309 1309  }
1310 1310  
1311 1311  /*
1312 1312   * zone.cpu-shares resource control support.
1313 1313   */
1314 1314  /*ARGSUSED*/
1315 1315  static rctl_qty_t
1316 1316  zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1317 1317  {
1318 1318          ASSERT(MUTEX_HELD(&p->p_lock));
1319 1319          return (p->p_zone->zone_shares);
1320 1320  }
1321 1321  
1322 1322  /*ARGSUSED*/
1323 1323  static int
1324 1324  zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1325 1325      rctl_qty_t nv)
1326 1326  {
1327 1327          ASSERT(MUTEX_HELD(&p->p_lock));
1328 1328          ASSERT(e->rcep_t == RCENTITY_ZONE);
1329 1329          if (e->rcep_p.zone == NULL)
1330 1330                  return (0);
1331 1331  
1332 1332          e->rcep_p.zone->zone_shares = nv;
1333 1333          return (0);
1334 1334  }
1335 1335  
           /*
            * Operations vector for the zone.cpu-shares resource control.
            * NOTE(review): the initializer is positional; the slots presumably
            * hold the action, usage, set and test operations in that order -
            * confirm against the rctl_ops_t definition.
            */
1336 1336  static rctl_ops_t zone_cpu_shares_ops = {
1337 1337          rcop_no_action,
1338 1338          zone_cpu_shares_usage,
1339 1339          zone_cpu_shares_set,
1340 1340          rcop_no_test
1341 1341  };
1342 1342  
1343 1343  /*
1344 1344   * zone.cpu-cap resource control support.
1345 1345   */
1346 1346  /*ARGSUSED*/
1347 1347  static rctl_qty_t
1348 1348  zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1349 1349  {
1350 1350          ASSERT(MUTEX_HELD(&p->p_lock));
1351 1351          return (cpucaps_zone_get(p->p_zone));
1352 1352  }
1353 1353  
1354 1354  /*ARGSUSED*/
1355 1355  static int
1356 1356  zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1357 1357      rctl_qty_t nv)
1358 1358  {
1359 1359          zone_t *zone = e->rcep_p.zone;
1360 1360  
1361 1361          ASSERT(MUTEX_HELD(&p->p_lock));
1362 1362          ASSERT(e->rcep_t == RCENTITY_ZONE);
1363 1363  
1364 1364          if (zone == NULL)
1365 1365                  return (0);
1366 1366  
1367 1367          /*
1368 1368           * set cap to the new value.
1369 1369           */
1370 1370          return (cpucaps_zone_set(zone, nv));
1371 1371  }
1372 1372  
1373 1373  static rctl_ops_t zone_cpu_cap_ops = {
1374 1374          rcop_no_action,
1375 1375          zone_cpu_cap_get,
1376 1376          zone_cpu_cap_set,
1377 1377          rcop_no_test
1378 1378  };
1379 1379  
1380 1380  /*ARGSUSED*/
1381 1381  static rctl_qty_t
1382 1382  zone_lwps_usage(rctl_t *r, proc_t *p)
1383 1383  {
1384 1384          rctl_qty_t nlwps;
1385 1385          zone_t *zone = p->p_zone;
1386 1386  
1387 1387          ASSERT(MUTEX_HELD(&p->p_lock));
1388 1388  
1389 1389          mutex_enter(&zone->zone_nlwps_lock);
1390 1390          nlwps = zone->zone_nlwps;
1391 1391          mutex_exit(&zone->zone_nlwps_lock);
1392 1392  
1393 1393          return (nlwps);
1394 1394  }
1395 1395  
1396 1396  /*ARGSUSED*/
1397 1397  static int
1398 1398  zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1399 1399      rctl_qty_t incr, uint_t flags)
1400 1400  {
1401 1401          rctl_qty_t nlwps;
1402 1402  
1403 1403          ASSERT(MUTEX_HELD(&p->p_lock));
1404 1404          ASSERT(e->rcep_t == RCENTITY_ZONE);
1405 1405          if (e->rcep_p.zone == NULL)
1406 1406                  return (0);
1407 1407          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1408 1408          nlwps = e->rcep_p.zone->zone_nlwps;
1409 1409  
1410 1410          if (nlwps + incr > rcntl->rcv_value)
1411 1411                  return (1);
1412 1412  
1413 1413          return (0);
1414 1414  }
1415 1415  
1416 1416  /*ARGSUSED*/
1417 1417  static int
1418 1418  zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1419 1419  {
1420 1420          ASSERT(MUTEX_HELD(&p->p_lock));
1421 1421          ASSERT(e->rcep_t == RCENTITY_ZONE);
1422 1422          if (e->rcep_p.zone == NULL)
1423 1423                  return (0);
1424 1424          e->rcep_p.zone->zone_nlwps_ctl = nv;
1425 1425          return (0);
1426 1426  }
1427 1427  
1428 1428  static rctl_ops_t zone_lwps_ops = {
1429 1429          rcop_no_action,
1430 1430          zone_lwps_usage,
1431 1431          zone_lwps_set,
1432 1432          zone_lwps_test,
1433 1433  };
1434 1434  
1435 1435  /*ARGSUSED*/
1436 1436  static rctl_qty_t
1437 1437  zone_procs_usage(rctl_t *r, proc_t *p)
1438 1438  {
1439 1439          rctl_qty_t nprocs;
1440 1440          zone_t *zone = p->p_zone;
1441 1441  
1442 1442          ASSERT(MUTEX_HELD(&p->p_lock));
1443 1443  
1444 1444          mutex_enter(&zone->zone_nlwps_lock);
1445 1445          nprocs = zone->zone_nprocs;
1446 1446          mutex_exit(&zone->zone_nlwps_lock);
1447 1447  
1448 1448          return (nprocs);
1449 1449  }
1450 1450  
1451 1451  /*ARGSUSED*/
1452 1452  static int
1453 1453  zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1454 1454      rctl_qty_t incr, uint_t flags)
1455 1455  {
1456 1456          rctl_qty_t nprocs;
1457 1457  
1458 1458          ASSERT(MUTEX_HELD(&p->p_lock));
1459 1459          ASSERT(e->rcep_t == RCENTITY_ZONE);
1460 1460          if (e->rcep_p.zone == NULL)
1461 1461                  return (0);
1462 1462          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1463 1463          nprocs = e->rcep_p.zone->zone_nprocs;
1464 1464  
1465 1465          if (nprocs + incr > rcntl->rcv_value)
1466 1466                  return (1);
1467 1467  
1468 1468          return (0);
1469 1469  }
1470 1470  
1471 1471  /*ARGSUSED*/
1472 1472  static int
1473 1473  zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1474 1474  {
1475 1475          ASSERT(MUTEX_HELD(&p->p_lock));
1476 1476          ASSERT(e->rcep_t == RCENTITY_ZONE);
1477 1477          if (e->rcep_p.zone == NULL)
1478 1478                  return (0);
1479 1479          e->rcep_p.zone->zone_nprocs_ctl = nv;
1480 1480          return (0);
1481 1481  }
1482 1482  
1483 1483  static rctl_ops_t zone_procs_ops = {
1484 1484          rcop_no_action,
1485 1485          zone_procs_usage,
1486 1486          zone_procs_set,
1487 1487          zone_procs_test,
1488 1488  };
1489 1489  
1490 1490  /*ARGSUSED*/
1491 1491  static rctl_qty_t
1492 1492  zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1493 1493  {
1494 1494          ASSERT(MUTEX_HELD(&p->p_lock));
1495 1495          return (p->p_zone->zone_shmmax);
1496 1496  }
1497 1497  
1498 1498  /*ARGSUSED*/
1499 1499  static int
1500 1500  zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1501 1501      rctl_qty_t incr, uint_t flags)
1502 1502  {
1503 1503          rctl_qty_t v;
1504 1504          ASSERT(MUTEX_HELD(&p->p_lock));
1505 1505          ASSERT(e->rcep_t == RCENTITY_ZONE);
1506 1506          v = e->rcep_p.zone->zone_shmmax + incr;
1507 1507          if (v > rval->rcv_value)
1508 1508                  return (1);
1509 1509          return (0);
1510 1510  }
1511 1511  
1512 1512  static rctl_ops_t zone_shmmax_ops = {
1513 1513          rcop_no_action,
1514 1514          zone_shmmax_usage,
1515 1515          rcop_no_set,
1516 1516          zone_shmmax_test
1517 1517  };
1518 1518  
1519 1519  /*ARGSUSED*/
1520 1520  static rctl_qty_t
1521 1521  zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1522 1522  {
1523 1523          ASSERT(MUTEX_HELD(&p->p_lock));
1524 1524          return (p->p_zone->zone_ipc.ipcq_shmmni);
1525 1525  }
1526 1526  
1527 1527  /*ARGSUSED*/
1528 1528  static int
1529 1529  zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530 1530      rctl_qty_t incr, uint_t flags)
1531 1531  {
1532 1532          rctl_qty_t v;
1533 1533          ASSERT(MUTEX_HELD(&p->p_lock));
1534 1534          ASSERT(e->rcep_t == RCENTITY_ZONE);
1535 1535          v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1536 1536          if (v > rval->rcv_value)
1537 1537                  return (1);
1538 1538          return (0);
1539 1539  }
1540 1540  
1541 1541  static rctl_ops_t zone_shmmni_ops = {
1542 1542          rcop_no_action,
1543 1543          zone_shmmni_usage,
1544 1544          rcop_no_set,
1545 1545          zone_shmmni_test
1546 1546  };
1547 1547  
1548 1548  /*ARGSUSED*/
1549 1549  static rctl_qty_t
1550 1550  zone_semmni_usage(rctl_t *rctl, struct proc *p)
1551 1551  {
1552 1552          ASSERT(MUTEX_HELD(&p->p_lock));
1553 1553          return (p->p_zone->zone_ipc.ipcq_semmni);
1554 1554  }
1555 1555  
1556 1556  /*ARGSUSED*/
1557 1557  static int
1558 1558  zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1559 1559      rctl_qty_t incr, uint_t flags)
1560 1560  {
1561 1561          rctl_qty_t v;
1562 1562          ASSERT(MUTEX_HELD(&p->p_lock));
1563 1563          ASSERT(e->rcep_t == RCENTITY_ZONE);
1564 1564          v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1565 1565          if (v > rval->rcv_value)
1566 1566                  return (1);
1567 1567          return (0);
1568 1568  }
1569 1569  
1570 1570  static rctl_ops_t zone_semmni_ops = {
1571 1571          rcop_no_action,
1572 1572          zone_semmni_usage,
1573 1573          rcop_no_set,
1574 1574          zone_semmni_test
1575 1575  };
1576 1576  
1577 1577  /*ARGSUSED*/
1578 1578  static rctl_qty_t
1579 1579  zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1580 1580  {
1581 1581          ASSERT(MUTEX_HELD(&p->p_lock));
1582 1582          return (p->p_zone->zone_ipc.ipcq_msgmni);
1583 1583  }
1584 1584  
1585 1585  /*ARGSUSED*/
1586 1586  static int
1587 1587  zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1588 1588      rctl_qty_t incr, uint_t flags)
1589 1589  {
1590 1590          rctl_qty_t v;
1591 1591          ASSERT(MUTEX_HELD(&p->p_lock));
1592 1592          ASSERT(e->rcep_t == RCENTITY_ZONE);
1593 1593          v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1594 1594          if (v > rval->rcv_value)
1595 1595                  return (1);
1596 1596          return (0);
1597 1597  }
1598 1598  
1599 1599  static rctl_ops_t zone_msgmni_ops = {
1600 1600          rcop_no_action,
1601 1601          zone_msgmni_usage,
1602 1602          rcop_no_set,
1603 1603          zone_msgmni_test
1604 1604  };
1605 1605  
1606 1606  /*ARGSUSED*/
1607 1607  static rctl_qty_t
1608 1608  zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1609 1609  {
1610 1610          rctl_qty_t q;
1611 1611          ASSERT(MUTEX_HELD(&p->p_lock));
1612 1612          mutex_enter(&p->p_zone->zone_mem_lock);
1613 1613          q = p->p_zone->zone_locked_mem;
1614 1614          mutex_exit(&p->p_zone->zone_mem_lock);
1615 1615          return (q);
1616 1616  }
1617 1617  
1618 1618  /*ARGSUSED*/
1619 1619  static int
1620 1620  zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1621 1621      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1622 1622  {
1623 1623          rctl_qty_t q;
1624 1624          zone_t *z;
1625 1625  
1626 1626          z = e->rcep_p.zone;
1627 1627          ASSERT(MUTEX_HELD(&p->p_lock));
1628 1628          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1629 1629          q = z->zone_locked_mem;
1630 1630          if (q + incr > rcntl->rcv_value)
1631 1631                  return (1);
1632 1632          return (0);
1633 1633  }
1634 1634  
1635 1635  /*ARGSUSED*/
1636 1636  static int
1637 1637  zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1638 1638      rctl_qty_t nv)
1639 1639  {
1640 1640          ASSERT(MUTEX_HELD(&p->p_lock));
1641 1641          ASSERT(e->rcep_t == RCENTITY_ZONE);
1642 1642          if (e->rcep_p.zone == NULL)
1643 1643                  return (0);
1644 1644          e->rcep_p.zone->zone_locked_mem_ctl = nv;
1645 1645          return (0);
1646 1646  }
1647 1647  
1648 1648  static rctl_ops_t zone_locked_mem_ops = {
1649 1649          rcop_no_action,
1650 1650          zone_locked_mem_usage,
1651 1651          zone_locked_mem_set,
1652 1652          zone_locked_mem_test
1653 1653  };
1654 1654  
1655 1655  /*ARGSUSED*/
1656 1656  static rctl_qty_t
1657 1657  zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1658 1658  {
1659 1659          rctl_qty_t q;
1660 1660          zone_t *z = p->p_zone;
1661 1661  
1662 1662          ASSERT(MUTEX_HELD(&p->p_lock));
1663 1663          mutex_enter(&z->zone_mem_lock);
1664 1664          q = z->zone_max_swap;
1665 1665          mutex_exit(&z->zone_mem_lock);
1666 1666          return (q);
1667 1667  }
1668 1668  
1669 1669  /*ARGSUSED*/
1670 1670  static int
1671 1671  zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1672 1672      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1673 1673  {
1674 1674          rctl_qty_t q;
1675 1675          zone_t *z;
1676 1676  
1677 1677          z = e->rcep_p.zone;
1678 1678          ASSERT(MUTEX_HELD(&p->p_lock));
1679 1679          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1680 1680          q = z->zone_max_swap;
1681 1681          if (q + incr > rcntl->rcv_value)
1682 1682                  return (1);
1683 1683          return (0);
1684 1684  }
1685 1685  
1686 1686  /*ARGSUSED*/
1687 1687  static int
1688 1688  zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1689 1689      rctl_qty_t nv)
1690 1690  {
1691 1691          ASSERT(MUTEX_HELD(&p->p_lock));
1692 1692          ASSERT(e->rcep_t == RCENTITY_ZONE);
1693 1693          if (e->rcep_p.zone == NULL)
1694 1694                  return (0);
1695 1695          e->rcep_p.zone->zone_max_swap_ctl = nv;
1696 1696          return (0);
1697 1697  }
1698 1698  
1699 1699  static rctl_ops_t zone_max_swap_ops = {
1700 1700          rcop_no_action,
1701 1701          zone_max_swap_usage,
1702 1702          zone_max_swap_set,
1703 1703          zone_max_swap_test
1704 1704  };
1705 1705  
1706 1706  /*ARGSUSED*/
1707 1707  static rctl_qty_t
1708 1708  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1709 1709  {
1710 1710          rctl_qty_t q;
1711 1711          zone_t *z = p->p_zone;
1712 1712  
1713 1713          ASSERT(MUTEX_HELD(&p->p_lock));
1714 1714          mutex_enter(&z->zone_rctl_lock);
1715 1715          q = z->zone_max_lofi;
1716 1716          mutex_exit(&z->zone_rctl_lock);
1717 1717          return (q);
1718 1718  }
1719 1719  
1720 1720  /*ARGSUSED*/
1721 1721  static int
1722 1722  zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1723 1723      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1724 1724  {
1725 1725          rctl_qty_t q;
1726 1726          zone_t *z;
1727 1727  
1728 1728          z = e->rcep_p.zone;
1729 1729          ASSERT(MUTEX_HELD(&p->p_lock));
1730 1730          ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1731 1731          q = z->zone_max_lofi;
1732 1732          if (q + incr > rcntl->rcv_value)
1733 1733                  return (1);
1734 1734          return (0);
1735 1735  }
1736 1736  
1737 1737  /*ARGSUSED*/
1738 1738  static int
1739 1739  zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1740 1740      rctl_qty_t nv)
1741 1741  {
1742 1742          ASSERT(MUTEX_HELD(&p->p_lock));
1743 1743          ASSERT(e->rcep_t == RCENTITY_ZONE);
1744 1744          if (e->rcep_p.zone == NULL)
1745 1745                  return (0);
1746 1746          e->rcep_p.zone->zone_max_lofi_ctl = nv;
1747 1747          return (0);
1748 1748  }
1749 1749  
1750 1750  static rctl_ops_t zone_max_lofi_ops = {
1751 1751          rcop_no_action,
1752 1752          zone_max_lofi_usage,
1753 1753          zone_max_lofi_set,
1754 1754          zone_max_lofi_test
1755 1755  };
1756 1756  
1757 1757  /*
1758 1758   * Helper function to brand the zone with a unique ID.
1759 1759   */
1760 1760  static void
1761 1761  zone_uniqid(zone_t *zone)
1762 1762  {
1763 1763          static uint64_t uniqid = 0;
1764 1764  
1765 1765          ASSERT(MUTEX_HELD(&zonehash_lock));
1766 1766          zone->zone_uniqid = uniqid++;
1767 1767  }
1768 1768  
1769 1769  /*
1770 1770   * Returns a held pointer to the "kcred" for the specified zone.
1771 1771   */
1772 1772  struct cred *
1773 1773  zone_get_kcred(zoneid_t zoneid)
1774 1774  {
1775 1775          zone_t *zone;
1776 1776          cred_t *cr;
1777 1777  
1778 1778          if ((zone = zone_find_by_id(zoneid)) == NULL)
1779 1779                  return (NULL);
1780 1780          cr = zone->zone_kcred;
1781 1781          crhold(cr);
1782 1782          zone_rele(zone);
1783 1783          return (cr);
1784 1784  }
1785 1785  
1786 1786  static int
1787 1787  zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1788 1788  {
1789 1789          zone_t *zone = ksp->ks_private;
1790 1790          zone_kstat_t *zk = ksp->ks_data;
1791 1791  
1792 1792          if (rw == KSTAT_WRITE)
1793 1793                  return (EACCES);
1794 1794  
1795 1795          zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1796 1796          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1797 1797          return (0);
1798 1798  }
1799 1799  
1800 1800  static int
1801 1801  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1802 1802  {
1803 1803          zone_t *zone = ksp->ks_private;
1804 1804          zone_kstat_t *zk = ksp->ks_data;
1805 1805  
1806 1806          if (rw == KSTAT_WRITE)
1807 1807                  return (EACCES);
1808 1808  
1809 1809          zk->zk_usage.value.ui64 = zone->zone_nprocs;
1810 1810          zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1811 1811          return (0);
1812 1812  }
1813 1813  
1814 1814  static int
1815 1815  zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1816 1816  {
1817 1817          zone_t *zone = ksp->ks_private;
1818 1818          zone_kstat_t *zk = ksp->ks_data;
1819 1819  
1820 1820          if (rw == KSTAT_WRITE)
1821 1821                  return (EACCES);
1822 1822  
1823 1823          zk->zk_usage.value.ui64 = zone->zone_max_swap;
1824 1824          zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1825 1825          return (0);
1826 1826  }
1827 1827  
1828 1828  static kstat_t *
1829 1829  zone_kstat_create_common(zone_t *zone, char *name,
1830 1830      int (*updatefunc) (kstat_t *, int))
1831 1831  {
1832 1832          kstat_t *ksp;
1833 1833          zone_kstat_t *zk;
1834 1834  
1835 1835          ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1836 1836              sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1837 1837              KSTAT_FLAG_VIRTUAL);
1838 1838  
1839 1839          if (ksp == NULL)
1840 1840                  return (NULL);
1841 1841  
1842 1842          zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1843 1843          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1844 1844          kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1845 1845          kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1846 1846          kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1847 1847          kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1848 1848          ksp->ks_update = updatefunc;
1849 1849          ksp->ks_private = zone;
1850 1850          kstat_install(ksp);
1851 1851          return (ksp);
1852 1852  }
1853 1853  
1854 1854  
1855 1855  static int
1856 1856  zone_mcap_kstat_update(kstat_t *ksp, int rw)
1857 1857  {
1858 1858          zone_t *zone = ksp->ks_private;
1859 1859          zone_mcap_kstat_t *zmp = ksp->ks_data;
1860 1860  
1861 1861          if (rw == KSTAT_WRITE)
1862 1862                  return (EACCES);
1863 1863  
1864 1864          zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1865 1865          zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1866 1866          zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1867 1867          zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1868 1868          zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1869 1869  
1870 1870          return (0);
1871 1871  }
1872 1872  
1873 1873  static kstat_t *
1874 1874  zone_mcap_kstat_create(zone_t *zone)
1875 1875  {
1876 1876          kstat_t *ksp;
1877 1877          zone_mcap_kstat_t *zmp;
1878 1878  
1879 1879          if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1880 1880              zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1881 1881              sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1882 1882              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1883 1883                  return (NULL);
1884 1884  
1885 1885          if (zone->zone_id != GLOBAL_ZONEID)
1886 1886                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1887 1887  
1888 1888          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1889 1889          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1890 1890          ksp->ks_lock = &zone->zone_mcap_lock;
1891 1891          zone->zone_mcap_stats = zmp;
1892 1892  
1893 1893          /* The kstat "name" field is not large enough for a full zonename */
1894 1894          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1895 1895          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1896 1896          kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1897 1897          kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1898 1898          kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1899 1899          kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1900 1900          kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1901 1901              KSTAT_DATA_UINT64);
1902 1902  
1903 1903          ksp->ks_update = zone_mcap_kstat_update;
1904 1904          ksp->ks_private = zone;

↓ open down ↓

1721 lines elided

↑ open up ↑

1905 1905  
1906 1906          kstat_install(ksp);
1907 1907          return (ksp);
1908 1908  }
1909 1909  
1910 1910  static int
1911 1911  zone_misc_kstat_update(kstat_t *ksp, int rw)
1912 1912  {
1913 1913          zone_t *zone = ksp->ks_private;
1914 1914          zone_misc_kstat_t *zmp = ksp->ks_data;
1915      -        hrtime_t tmp;
     1915 +        hrtime_t hrtime;
     1916 +        uint64_t tmp;
1916 1917  
1917 1918          if (rw == KSTAT_WRITE)
1918 1919                  return (EACCES);
1919 1920  
1920      -        tmp = zone->zone_utime;
1921      -        scalehrtime(&tmp);
1922      -        zmp->zm_utime.value.ui64 = tmp;
1923      -        tmp = zone->zone_stime;
1924      -        scalehrtime(&tmp);
1925      -        zmp->zm_stime.value.ui64 = tmp;
1926      -        tmp = zone->zone_wtime;
1927      -        scalehrtime(&tmp);
1928      -        zmp->zm_wtime.value.ui64 = tmp;
     1921 +        tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME);
     1922 +        hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
     1923 +        scalehrtime(&hrtime);
     1924 +        zmp->zm_stime.value.ui64 = hrtime;
1929 1925  
     1926 +        tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME);
     1927 +        hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
     1928 +        scalehrtime(&hrtime);
     1929 +        zmp->zm_utime.value.ui64 = hrtime;
     1930 +
     1931 +        tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME);
     1932 +        hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
     1933 +        scalehrtime(&hrtime);
     1934 +        zmp->zm_wtime.value.ui64 = hrtime;
     1935 +
1930 1936          zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1931 1937          zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1932 1938          zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1933 1939  
1934 1940          zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1935 1941          zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1936 1942          zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1937 1943          zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1938 1944  
1939 1945          zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;

1940 1946  
1941 1947          zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1942 1948          zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1943 1949  
1944 1950          return (0);
1945 1951  }
1946 1952  
1947 1953  static kstat_t *
1948 1954  zone_misc_kstat_create(zone_t *zone)
1949 1955  {
1950 1956          kstat_t *ksp;
1951 1957          zone_misc_kstat_t *zmp;
1952 1958  
1953 1959          if ((ksp = kstat_create_zone("zones", zone->zone_id,
1954 1960              zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1955 1961              sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1956 1962              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1957 1963                  return (NULL);
1958 1964  
1959 1965          if (zone->zone_id != GLOBAL_ZONEID)
1960 1966                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1961 1967  
1962 1968          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1963 1969          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1964 1970          ksp->ks_lock = &zone->zone_misc_lock;
1965 1971          zone->zone_misc_stats = zmp;
1966 1972  
1967 1973          /* The kstat "name" field is not large enough for a full zonename */
1968 1974          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1969 1975          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1970 1976          kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1971 1977          kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1972 1978          kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1973 1979          kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1974 1980          kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1975 1981          kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1976 1982              KSTAT_DATA_UINT32);
1977 1983          kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1978 1984          kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1979 1985              KSTAT_DATA_UINT32);
1980 1986          kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1981 1987          kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1982 1988          kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1983 1989              KSTAT_DATA_UINT32);
1984 1990          kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1985 1991          kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1986 1992  
1987 1993          ksp->ks_update = zone_misc_kstat_update;
1988 1994          ksp->ks_private = zone;
1989 1995  
1990 1996          kstat_install(ksp);
1991 1997          return (ksp);
1992 1998  }
1993 1999  
1994 2000  static void
1995 2001  zone_kstat_create(zone_t *zone)
1996 2002  {
1997 2003          zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1998 2004              "lockedmem", zone_lockedmem_kstat_update);
1999 2005          zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2000 2006              "swapresv", zone_swapresv_kstat_update);
2001 2007          zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2002 2008              "nprocs", zone_nprocs_kstat_update);
2003 2009  
2004 2010          if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2005 2011                  zone->zone_mcap_stats = kmem_zalloc(
2006 2012                      sizeof (zone_mcap_kstat_t), KM_SLEEP);
2007 2013          }
2008 2014  
2009 2015          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2010 2016                  zone->zone_misc_stats = kmem_zalloc(
2011 2017                      sizeof (zone_misc_kstat_t), KM_SLEEP);
2012 2018          }
2013 2019  }
2014 2020  
2015 2021  static void
2016 2022  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2017 2023  {
2018 2024          void *data;
2019 2025  
2020 2026          if (*pkstat != NULL) {
2021 2027                  data = (*pkstat)->ks_data;
2022 2028                  kstat_delete(*pkstat);
2023 2029                  kmem_free(data, datasz);
2024 2030                  *pkstat = NULL;
2025 2031          }
2026 2032  }
2027 2033  
2028 2034  static void
2029 2035  zone_kstat_delete(zone_t *zone)
2030 2036  {
2031 2037          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2032 2038              sizeof (zone_kstat_t));
2033 2039          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2034 2040              sizeof (zone_kstat_t));
2035 2041          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2036 2042              sizeof (zone_kstat_t));
2037 2043          zone_kstat_delete_common(&zone->zone_mcap_ksp,
2038 2044              sizeof (zone_mcap_kstat_t));
2039 2045          zone_kstat_delete_common(&zone->zone_misc_ksp,
2040 2046              sizeof (zone_misc_kstat_t));
2041 2047  }
2042 2048  
2043 2049  /*
2044 2050   * Called very early on in boot to initialize the ZSD list so that
2045 2051   * zone_key_create() can be called before zone_init().  It also initializes
2046 2052   * portions of zone0 which may be used before zone_init() is called.  The
2047 2053   * variable "global_zone" will be set when zone0 is fully initialized by
2048 2054   * zone_init().
2049 2055   */
2050 2056  void
2051 2057  zone_zsd_init(void)
2052 2058  {
2053 2059          mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2054 2060          mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2055 2061          list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2056 2062              offsetof(struct zsd_entry, zsd_linkage));
2057 2063          list_create(&zone_active, sizeof (zone_t),
2058 2064              offsetof(zone_t, zone_linkage));
2059 2065          list_create(&zone_deathrow, sizeof (zone_t),
2060 2066              offsetof(zone_t, zone_linkage));
2061 2067  
2062 2068          mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2063 2069          mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2064 2070          mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2065 2071          zone0.zone_shares = 1;
2066 2072          zone0.zone_nlwps = 0;
2067 2073          zone0.zone_nlwps_ctl = INT_MAX;
2068 2074          zone0.zone_nprocs = 0;
2069 2075          zone0.zone_nprocs_ctl = INT_MAX;
2070 2076          zone0.zone_locked_mem = 0;
2071 2077          zone0.zone_locked_mem_ctl = UINT64_MAX;
2072 2078          ASSERT(zone0.zone_max_swap == 0);
2073 2079          zone0.zone_max_swap_ctl = UINT64_MAX;
2074 2080          zone0.zone_max_lofi = 0;
2075 2081          zone0.zone_max_lofi_ctl = UINT64_MAX;
2076 2082          zone0.zone_shmmax = 0;
2077 2083          zone0.zone_ipc.ipcq_shmmni = 0;
2078 2084          zone0.zone_ipc.ipcq_semmni = 0;
2079 2085          zone0.zone_ipc.ipcq_msgmni = 0;
2080 2086          zone0.zone_name = GLOBAL_ZONENAME;
2081 2087          zone0.zone_nodename = utsname.nodename;
2082 2088          zone0.zone_domain = srpc_domain;
2083 2089          zone0.zone_hostid = HW_INVALID_HOSTID;
2084 2090          zone0.zone_fs_allowed = NULL;
2085 2091          psecflags_default(&zone0.zone_secflags);
2086 2092          zone0.zone_ref = 1;
2087 2093          zone0.zone_id = GLOBAL_ZONEID;
2088 2094          zone0.zone_status = ZONE_IS_RUNNING;
2089 2095          zone0.zone_rootpath = "/";

↓ open down ↓

150 lines elided

↑ open up ↑

2090 2096          zone0.zone_rootpathlen = 2;
2091 2097          zone0.zone_psetid = ZONE_PS_INVAL;
2092 2098          zone0.zone_ncpus = 0;
2093 2099          zone0.zone_ncpus_online = 0;
2094 2100          zone0.zone_proc_initpid = 1;
2095 2101          zone0.zone_initname = initname;
2096 2102          zone0.zone_lockedmem_kstat = NULL;
2097 2103          zone0.zone_swapresv_kstat = NULL;
2098 2104          zone0.zone_nprocs_kstat = NULL;
2099 2105  
2100      -        zone0.zone_stime = 0;
2101      -        zone0.zone_utime = 0;
2102      -        zone0.zone_wtime = 0;
2103      -
2104 2106          list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2105 2107              offsetof(zone_ref_t, zref_linkage));
2106 2108          list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2107 2109              offsetof(struct zsd_entry, zsd_linkage));
2108 2110          list_insert_head(&zone_active, &zone0);
2109 2111  
2110 2112          /*
2111 2113           * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2112 2114           * to anything meaningful.  It is assigned to be 'rootdir' in
2113 2115           * vfs_mountroot().

2114 2116           */
2115 2117          zone0.zone_rootvp = NULL;
2116 2118          zone0.zone_vfslist = NULL;
2117 2119          zone0.zone_bootargs = initargs;
2118 2120          zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2119 2121          /*
2120 2122           * The global zone has all privileges
2121 2123           */
2122 2124          priv_fillset(zone0.zone_privset);
2123 2125          /*
2124 2126           * Add p0 to the global zone
2125 2127           */
2126 2128          zone0.zone_zsched = &p0;
2127 2129          p0.p_zone = &zone0;
2128 2130  }
2129 2131  
2130 2132  /*
2131 2133   * Compute a hash value based on the contents of the label and the DOI.  The
2132 2134   * hash algorithm is somewhat arbitrary, but is based on the observation that
2133 2135   * humans will likely pick labels that differ by amounts that work out to be
2134 2136   * multiples of the number of hash chains, and thus stirring in some primes
2135 2137   * should help.
2136 2138   */
2137 2139  static uint_t
2138 2140  hash_bylabel(void *hdata, mod_hash_key_t key)
2139 2141  {
2140 2142          const ts_label_t *lab = (ts_label_t *)key;
2141 2143          const uint32_t *up, *ue;
2142 2144          uint_t hash;
2143 2145          int i;
2144 2146  
2145 2147          _NOTE(ARGUNUSED(hdata));
2146 2148  
2147 2149          hash = lab->tsl_doi + (lab->tsl_doi << 1);
2148 2150          /* we depend on alignment of label, but not representation */
2149 2151          up = (const uint32_t *)&lab->tsl_label;
2150 2152          ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2151 2153          i = 1;
2152 2154          while (up < ue) {
2153 2155                  /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2154 2156                  hash += *up + (*up << ((i % 16) + 1));
2155 2157                  up++;
2156 2158                  i++;
2157 2159          }
2158 2160          return (hash);
2159 2161  }
2160 2162  
2161 2163  /*
2162 2164   * All that mod_hash cares about here is zero (equal) versus non-zero (not
2163 2165   * equal).  This may need to be changed if less than / greater than is ever
2164 2166   * needed.
2165 2167   */
2166 2168  static int
2167 2169  hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2168 2170  {
2169 2171          ts_label_t *lab1 = (ts_label_t *)key1;
2170 2172          ts_label_t *lab2 = (ts_label_t *)key2;
2171 2173  
2172 2174          return (label_equal(lab1, lab2) ? 0 : 1);
2173 2175  }
2174 2176  
2175 2177  /*
2176 2178   * Called by main() to initialize the zones framework.
2177 2179   */
2178 2180  void
2179 2181  zone_init(void)
2180 2182  {
2181 2183          rctl_dict_entry_t *rde;
2182 2184          rctl_val_t *dval;
2183 2185          rctl_set_t *set;
2184 2186          rctl_alloc_gp_t *gp;
2185 2187          rctl_entity_p_t e;
2186 2188          int res;
2187 2189  
2188 2190          ASSERT(curproc == &p0);
2189 2191  
2190 2192          /*
2191 2193           * Create ID space for zone IDs.  ID 0 is reserved for the
2192 2194           * global zone.
2193 2195           */
2194 2196          zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2195 2197  
2196 2198          /*
2197 2199           * Initialize generic zone resource controls, if any.
2198 2200           */
2199 2201          rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2200 2202              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2201 2203              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2202 2204              FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2203 2205  
2204 2206          rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2205 2207              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2206 2208              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2207 2209              RCTL_GLOBAL_INFINITE,
2208 2210              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2209 2211  
2210 2212          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2211 2213              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2212 2214              INT_MAX, INT_MAX, &zone_lwps_ops);
2213 2215  
2214 2216          rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2215 2217              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2216 2218              INT_MAX, INT_MAX, &zone_procs_ops);
2217 2219  
2218 2220          /*
2219 2221           * System V IPC resource controls
2220 2222           */
2221 2223          rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2222 2224              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2223 2225              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2224 2226  
2225 2227          rc_zone_semmni = rctl_register("zone.max-sem-ids",
2226 2228              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2227 2229              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2228 2230  
2229 2231          rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2230 2232              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2231 2233              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2232 2234  
2233 2235          rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2234 2236              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2235 2237              RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2236 2238  
2237 2239          /*
2238 2240           * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2239 2241           * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2240 2242           */
2241 2243          dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2242 2244          bzero(dval, sizeof (rctl_val_t));
2243 2245          dval->rcv_value = 1;
2244 2246          dval->rcv_privilege = RCPRIV_PRIVILEGED;
2245 2247          dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2246 2248          dval->rcv_action_recip_pid = -1;
2247 2249  
2248 2250          rde = rctl_dict_lookup("zone.cpu-shares");
2249 2251          (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2250 2252  
2251 2253          rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2252 2254              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2253 2255              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2254 2256              &zone_locked_mem_ops);
2255 2257  
2256 2258          rc_zone_max_swap = rctl_register("zone.max-swap",
2257 2259              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2258 2260              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2259 2261              &zone_max_swap_ops);
2260 2262  
2261 2263          rc_zone_max_lofi = rctl_register("zone.max-lofi",
2262 2264              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2263 2265              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2264 2266              &zone_max_lofi_ops);
2265 2267  
2266 2268          /*
2267 2269           * Initialize the ``global zone''.
2268 2270           */
2269 2271          set = rctl_set_create();
2270 2272          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2271 2273          mutex_enter(&p0.p_lock);
2272 2274          e.rcep_p.zone = &zone0;
2273 2275          e.rcep_t = RCENTITY_ZONE;
2274 2276          zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2275 2277              gp);
2276 2278  
2277 2279          zone0.zone_nlwps = p0.p_lwpcnt;
2278 2280          zone0.zone_nprocs = 1;
2279 2281          zone0.zone_ntasks = 1;
2280 2282          mutex_exit(&p0.p_lock);
2281 2283          zone0.zone_restart_init = B_TRUE;
2282 2284          zone0.zone_brand = &native_brand;
2283 2285          rctl_prealloc_destroy(gp);
2284 2286          /*
2285 2287           * pool_default hasn't been initialized yet, so we let pool_init()
2286 2288           * take care of making sure the global zone is in the default pool.
2287 2289           */
2288 2290  
2289 2291          /*
2290 2292           * Initialize global zone kstats
2291 2293           */
2292 2294          zone_kstat_create(&zone0);
2293 2295  
2294 2296          /*
2295 2297           * Initialize zone label.
2296 2298           * mlp are initialized when tnzonecfg is loaded.

↓ open down ↓

183 lines elided

↑ open up ↑

2297 2299           */
2298 2300          zone0.zone_slabel = l_admin_low;
2299 2301          rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2300 2302          label_hold(l_admin_low);
2301 2303  
2302 2304          /*
2303 2305           * Initialise the lock for the database structure used by mntfs.
2304 2306           */
2305 2307          rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2306 2308  
     2309 +        zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
     2310 +
2307 2311          mutex_enter(&zonehash_lock);
2308 2312          zone_uniqid(&zone0);
2309 2313          ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2310 2314  
2311 2315          zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2312 2316              mod_hash_null_valdtor);
2313 2317          zonehashbyname = mod_hash_create_strhash("zone_by_name",
2314 2318              zone_hash_size, mod_hash_null_valdtor);
2315 2319          /*
2316 2320           * maintain zonehashbylabel only for labeled systems

2317 2321           */
2318 2322          if (is_system_labeled())
2319 2323                  zonehashbylabel = mod_hash_create_extended("zone_by_label",
2320 2324                      zone_hash_size, mod_hash_null_keydtor,
2321 2325                      mod_hash_null_valdtor, hash_bylabel, NULL,
2322 2326                      hash_labelkey_cmp, KM_SLEEP);
2323 2327          zonecount = 1;
2324 2328  
2325 2329          (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2326 2330              (mod_hash_val_t)&zone0);
2327 2331          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2328 2332              (mod_hash_val_t)&zone0);
2329 2333          if (is_system_labeled()) {
2330 2334                  zone0.zone_flags |= ZF_HASHED_LABEL;
2331 2335                  (void) mod_hash_insert(zonehashbylabel,
2332 2336                      (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2333 2337          }
2334 2338          mutex_exit(&zonehash_lock);
2335 2339  
2336 2340          /*
2337 2341           * We avoid setting zone_kcred until now, since kcred is initialized
2338 2342           * sometime after zone_zsd_init() and before zone_init().
2339 2343           */
2340 2344          zone0.zone_kcred = kcred;
2341 2345          /*
2342 2346           * The global zone is fully initialized (except for zone_rootvp which
2343 2347           * will be set when the root filesystem is mounted).
2344 2348           */
2345 2349          global_zone = &zone0;
2346 2350  
2347 2351          /*
2348 2352           * Setup an event channel to send zone status change notifications on
2349 2353           */
2350 2354          res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2351 2355              EVCH_CREAT);
2352 2356  
2353 2357          if (res)
2354 2358                  panic("Sysevent_evc_bind failed during zone setup.\n");
2355 2359  
2356 2360  }
2357 2361  
2358 2362  static void
2359 2363  zone_free(zone_t *zone)
2360 2364  {
2361 2365          ASSERT(zone != global_zone);
2362 2366          ASSERT(zone->zone_ntasks == 0);
2363 2367          ASSERT(zone->zone_nlwps == 0);
2364 2368          ASSERT(zone->zone_nprocs == 0);
2365 2369          ASSERT(zone->zone_cred_ref == 0);
2366 2370          ASSERT(zone->zone_kcred == NULL);
2367 2371          ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2368 2372              zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2369 2373          ASSERT(list_is_empty(&zone->zone_ref_list));
2370 2374  
2371 2375          /*
2372 2376           * Remove any zone caps.
2373 2377           */
2374 2378          cpucaps_zone_remove(zone);
2375 2379  
2376 2380          ASSERT(zone->zone_cpucap == NULL);
2377 2381  
2378 2382          /* remove from deathrow list */
2379 2383          if (zone_status_get(zone) == ZONE_IS_DEAD) {
2380 2384                  ASSERT(zone->zone_ref == 0);

↓ open down ↓

64 lines elided

↑ open up ↑

2381 2385                  mutex_enter(&zone_deathrow_lock);
2382 2386                  list_remove(&zone_deathrow, zone);
2383 2387                  mutex_exit(&zone_deathrow_lock);
2384 2388          }
2385 2389  
2386 2390          list_destroy(&zone->zone_ref_list);
2387 2391          zone_free_zsd(zone);
2388 2392          zone_free_datasets(zone);
2389 2393          list_destroy(&zone->zone_dl_list);
2390 2394  
     2395 +        cpu_uarray_free(zone->zone_ustate);
     2396 +
2391 2397          if (zone->zone_rootvp != NULL)
2392 2398                  VN_RELE(zone->zone_rootvp);
2393 2399          if (zone->zone_rootpath)
2394 2400                  kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2395 2401          if (zone->zone_name != NULL)
2396 2402                  kmem_free(zone->zone_name, ZONENAME_MAX);
2397 2403          if (zone->zone_slabel != NULL)
2398 2404                  label_rele(zone->zone_slabel);
2399 2405          if (zone->zone_nodename != NULL)
2400 2406                  kmem_free(zone->zone_nodename, _SYS_NMLN);

2401 2407          if (zone->zone_domain != NULL)
2402 2408                  kmem_free(zone->zone_domain, _SYS_NMLN);
2403 2409          if (zone->zone_privset != NULL)
2404 2410                  kmem_free(zone->zone_privset, sizeof (priv_set_t));
2405 2411          if (zone->zone_rctls != NULL)
2406 2412                  rctl_set_free(zone->zone_rctls);
2407 2413          if (zone->zone_bootargs != NULL)
2408 2414                  strfree(zone->zone_bootargs);
2409 2415          if (zone->zone_initname != NULL)
2410 2416                  strfree(zone->zone_initname);
2411 2417          if (zone->zone_fs_allowed != NULL)
2412 2418                  strfree(zone->zone_fs_allowed);
2413 2419          if (zone->zone_pfexecd != NULL)
2414 2420                  klpd_freelist(&zone->zone_pfexecd);
2415 2421          id_free(zoneid_space, zone->zone_id);
2416 2422          mutex_destroy(&zone->zone_lock);
2417 2423          cv_destroy(&zone->zone_cv);
2418 2424          rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2419 2425          rw_destroy(&zone->zone_mntfs_db_lock);
2420 2426          kmem_free(zone, sizeof (zone_t));
2421 2427  }
2422 2428  
2423 2429  /*
2424 2430   * See block comment at the top of this file for information about zone
2425 2431   * status values.
2426 2432   */
2427 2433  /*
2428 2434   * Convenience function for setting zone status.
2429 2435   */
2430 2436  static void
2431 2437  zone_status_set(zone_t *zone, zone_status_t status)
2432 2438  {
2433 2439  
2434 2440          nvlist_t *nvl = NULL;
2435 2441          ASSERT(MUTEX_HELD(&zone_status_lock));
2436 2442          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2437 2443              status >= zone_status_get(zone));
2438 2444  
2439 2445          if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2440 2446              nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2441 2447              nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2442 2448              zone_status_table[status]) ||
2443 2449              nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2444 2450              zone_status_table[zone->zone_status]) ||
2445 2451              nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2446 2452              nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2447 2453              sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2448 2454              ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2449 2455  #ifdef DEBUG
2450 2456                  (void) printf(
2451 2457                      "Failed to allocate and send zone state change event.\n");
2452 2458  #endif
2453 2459          }
2454 2460          nvlist_free(nvl);
2455 2461  
2456 2462          zone->zone_status = status;
2457 2463  
2458 2464          cv_broadcast(&zone->zone_cv);
2459 2465  }
2460 2466  
2461 2467  /*
2462 2468   * Public function to retrieve the zone status.  The zone status may
2463 2469   * change after it is retrieved.
2464 2470   */
2465 2471  zone_status_t
2466 2472  zone_status_get(zone_t *zone)
2467 2473  {
2468 2474          return (zone->zone_status);
2469 2475  }
2470 2476  
2471 2477  static int
2472 2478  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2473 2479  {
2474 2480          char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2475 2481          int err = 0;
2476 2482  
2477 2483          ASSERT(zone != global_zone);
2478 2484          if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2479 2485                  goto done;      /* EFAULT or ENAMETOOLONG */
2480 2486  
2481 2487          if (zone->zone_bootargs != NULL)
2482 2488                  strfree(zone->zone_bootargs);
2483 2489  
2484 2490          zone->zone_bootargs = strdup(buf);
2485 2491  
2486 2492  done:
2487 2493          kmem_free(buf, BOOTARGS_MAX);
2488 2494          return (err);
2489 2495  }
2490 2496  
2491 2497  static int
2492 2498  zone_set_brand(zone_t *zone, const char *brand)
2493 2499  {
2494 2500          struct brand_attr *attrp;
2495 2501          brand_t *bp;
2496 2502  
2497 2503          attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2498 2504          if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2499 2505                  kmem_free(attrp, sizeof (struct brand_attr));
2500 2506                  return (EFAULT);
2501 2507          }
2502 2508  
2503 2509          bp = brand_register_zone(attrp);
2504 2510          kmem_free(attrp, sizeof (struct brand_attr));
2505 2511          if (bp == NULL)
2506 2512                  return (EINVAL);
2507 2513  
2508 2514          /*
2509 2515           * This is the only place where a zone can change its brand.
2510 2516           * We already need to hold zone_status_lock to check the zone
2511 2517           * status, so we'll just use that lock to serialize zone
2512 2518           * branding requests as well.
2513 2519           */
2514 2520          mutex_enter(&zone_status_lock);
2515 2521  
2516 2522          /* Re-branding is not allowed, and the zone must not be booting yet */
2517 2523          if ((ZONE_IS_BRANDED(zone)) ||
2518 2524              (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2519 2525                  mutex_exit(&zone_status_lock);
2520 2526                  brand_unregister_zone(bp);
2521 2527                  return (EINVAL);
2522 2528          }
2523 2529  
2524 2530          /* set up the brand specific data */
2525 2531          zone->zone_brand = bp;
2526 2532          ZBROP(zone)->b_init_brand_data(zone);
2527 2533  
2528 2534          mutex_exit(&zone_status_lock);
2529 2535          return (0);
2530 2536  }
2531 2537  
2532 2538  static int
2533 2539  zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2534 2540  {
2535 2541          int err = 0;
2536 2542          psecflags_t psf;
2537 2543  
2538 2544          ASSERT(zone != global_zone);
2539 2545  
2540 2546          if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
2541 2547                  return (err);
2542 2548  
2543 2549          if (zone_status_get(zone) > ZONE_IS_READY)
2544 2550                  return (EINVAL);
2545 2551  
2546 2552          if (!psecflags_validate(&psf))
2547 2553                  return (EINVAL);
2548 2554  
2549 2555          (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2550 2556  
2551 2557          /* Set security flags on the zone's zsched */
2552 2558          (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2553 2559              sizeof (zone->zone_zsched->p_secflags));
2554 2560  
2555 2561          return (0);
2556 2562  }
2557 2563  
2558 2564  static int
2559 2565  zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2560 2566  {
2561 2567          char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2562 2568          int err = 0;
2563 2569  
2564 2570          ASSERT(zone != global_zone);
2565 2571          if ((err = copyinstr(zone_fs_allowed, buf,
2566 2572              ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2567 2573                  goto done;
2568 2574  
2569 2575          if (zone->zone_fs_allowed != NULL)
2570 2576                  strfree(zone->zone_fs_allowed);
2571 2577  
2572 2578          zone->zone_fs_allowed = strdup(buf);
2573 2579  
2574 2580  done:
2575 2581          kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2576 2582          return (err);
2577 2583  }
2578 2584  
2579 2585  static int
2580 2586  zone_set_initname(zone_t *zone, const char *zone_initname)
2581 2587  {
2582 2588          char initname[INITNAME_SZ];
2583 2589          size_t len;
2584 2590          int err = 0;
2585 2591  
2586 2592          ASSERT(zone != global_zone);
2587 2593          if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2588 2594                  return (err);   /* EFAULT or ENAMETOOLONG */
2589 2595  
2590 2596          if (zone->zone_initname != NULL)
2591 2597                  strfree(zone->zone_initname);
2592 2598  
2593 2599          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2594 2600          (void) strcpy(zone->zone_initname, initname);
2595 2601          return (0);
2596 2602  }
2597 2603  
2598 2604  static int
2599 2605  zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2600 2606  {
2601 2607          uint64_t mcap;
2602 2608          int err = 0;
2603 2609  
2604 2610          if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2605 2611                  zone->zone_phys_mcap = mcap;
2606 2612  
2607 2613          return (err);
2608 2614  }
2609 2615  
2610 2616  static int
2611 2617  zone_set_sched_class(zone_t *zone, const char *new_class)
2612 2618  {
2613 2619          char sched_class[PC_CLNMSZ];
2614 2620          id_t classid;
2615 2621          int err;
2616 2622  
2617 2623          ASSERT(zone != global_zone);
2618 2624          if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2619 2625                  return (err);   /* EFAULT or ENAMETOOLONG */
2620 2626  
2621 2627          if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2622 2628                  return (set_errno(EINVAL));
2623 2629          zone->zone_defaultcid = classid;
2624 2630          ASSERT(zone->zone_defaultcid > 0 &&
2625 2631              zone->zone_defaultcid < loaded_classes);
2626 2632  
2627 2633          return (0);
2628 2634  }
2629 2635  
2630 2636  /*
2631 2637   * Block indefinitely waiting for (zone_status >= status)
2632 2638   */
2633 2639  void
2634 2640  zone_status_wait(zone_t *zone, zone_status_t status)
2635 2641  {
2636 2642          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2637 2643  
2638 2644          mutex_enter(&zone_status_lock);
2639 2645          while (zone->zone_status < status) {
2640 2646                  cv_wait(&zone->zone_cv, &zone_status_lock);
2641 2647          }
2642 2648          mutex_exit(&zone_status_lock);
2643 2649  }
2644 2650  
2645 2651  /*
2646 2652   * Private CPR-safe version of zone_status_wait().
2647 2653   */
2648 2654  static void
2649 2655  zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2650 2656  {
2651 2657          callb_cpr_t cprinfo;
2652 2658  
2653 2659          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2654 2660  
2655 2661          CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2656 2662              str);
2657 2663          mutex_enter(&zone_status_lock);
2658 2664          while (zone->zone_status < status) {
2659 2665                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
2660 2666                  cv_wait(&zone->zone_cv, &zone_status_lock);
2661 2667                  CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2662 2668          }
2663 2669          /*
2664 2670           * zone_status_lock is implicitly released by the following.
2665 2671           */
2666 2672          CALLB_CPR_EXIT(&cprinfo);
2667 2673  }
2668 2674  
2669 2675  /*
2670 2676   * Block until zone enters requested state or signal is received.  Return (0)
2671 2677   * if signaled, non-zero otherwise.
2672 2678   */
2673 2679  int
2674 2680  zone_status_wait_sig(zone_t *zone, zone_status_t status)
2675 2681  {
2676 2682          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2677 2683  
2678 2684          mutex_enter(&zone_status_lock);
2679 2685          while (zone->zone_status < status) {
2680 2686                  if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2681 2687                          mutex_exit(&zone_status_lock);
2682 2688                          return (0);
2683 2689                  }
2684 2690          }
2685 2691          mutex_exit(&zone_status_lock);
2686 2692          return (1);
2687 2693  }
2688 2694  
2689 2695  /*
2690 2696   * Block until the zone enters the requested state or the timeout expires,
2691 2697   * whichever happens first.  Return (-1) if operation timed out, time remaining
2692 2698   * otherwise.
2693 2699   */
2694 2700  clock_t
2695 2701  zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2696 2702  {
2697 2703          clock_t timeleft = 0;
2698 2704  
2699 2705          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2700 2706  
2701 2707          mutex_enter(&zone_status_lock);
2702 2708          while (zone->zone_status < status && timeleft != -1) {
2703 2709                  timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2704 2710          }
2705 2711          mutex_exit(&zone_status_lock);
2706 2712          return (timeleft);
2707 2713  }
2708 2714  
2709 2715  /*
2710 2716   * Block until the zone enters the requested state, the current process is
2711 2717   * signaled, or the timeout expires, whichever happens first.  Return (-1) if
2712 2718   * operation timed out, 0 if signaled, time remaining otherwise.
2713 2719   */
2714 2720  clock_t
2715 2721  zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2716 2722  {
2717 2723          clock_t timeleft = tim - ddi_get_lbolt();
2718 2724  
2719 2725          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2720 2726  
2721 2727          mutex_enter(&zone_status_lock);
2722 2728          while (zone->zone_status < status) {
2723 2729                  timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2724 2730                      tim);
2725 2731                  if (timeleft <= 0)
2726 2732                          break;
2727 2733          }
2728 2734          mutex_exit(&zone_status_lock);
2729 2735          return (timeleft);
2730 2736  }
2731 2737  
2732 2738  /*
2733 2739   * Zones have two reference counts: one for references from credential
2734 2740   * structures (zone_cred_ref), and one (zone_ref) for everything else.
2735 2741   * This is so we can allow a zone to be rebooted while there are still
2736 2742   * outstanding cred references, since certain drivers cache dblks (which
2737 2743   * implicitly results in cached creds).  We wait for zone_ref to drop to
2738 2744   * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2739 2745   * later freed when the zone_cred_ref drops to 0, though nothing other
2740 2746   * than the zone id and privilege set should be accessed once the zone
2741 2747   * is "dead".
2742 2748   *
2743 2749   * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2744 2750   * to force halt/reboot to block waiting for the zone_cred_ref to drop
2745 2751   * to 0.  This can be useful to flush out other sources of cached creds
2746 2752   * that may be less innocuous than the driver case.
2747 2753   *
2748 2754   * Zones also provide a tracked reference counting mechanism in which zone
2749 2755   * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2750 2756   * debuggers determine the sources of leaked zone references.  See
2751 2757   * zone_hold_ref() and zone_rele_ref() below for more information.
2752 2758   */
2753 2759  
2754 2760  int zone_wait_for_cred = 0;
2755 2761  
2756 2762  static void
2757 2763  zone_hold_locked(zone_t *z)
2758 2764  {
2759 2765          ASSERT(MUTEX_HELD(&z->zone_lock));
2760 2766          z->zone_ref++;
2761 2767          ASSERT(z->zone_ref != 0);
2762 2768  }
2763 2769  
2764 2770  /*
2765 2771   * Increment the specified zone's reference count.  The zone's zone_t structure
2766 2772   * will not be freed as long as the zone's reference count is nonzero.
2767 2773   * Decrement the zone's reference count via zone_rele().
2768 2774   *
2769 2775   * NOTE: This function should only be used to hold zones for short periods of
2770 2776   * time.  Use zone_hold_ref() if the zone must be held for a long time.
2771 2777   */
2772 2778  void
2773 2779  zone_hold(zone_t *z)
2774 2780  {
2775 2781          mutex_enter(&z->zone_lock);
2776 2782          zone_hold_locked(z);
2777 2783          mutex_exit(&z->zone_lock);
2778 2784  }
2779 2785  
2780 2786  /*
2781 2787   * If the non-cred ref count drops to 1 and either the cred ref count
2782 2788   * is 0 or we aren't waiting for cred references, the zone is ready to
2783 2789   * be destroyed.
2784 2790   */
2785 2791  #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
2786 2792              (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2787 2793  
2788 2794  /*
2789 2795   * Common zone reference release function invoked by zone_rele() and
2790 2796   * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2791 2797   * zone's subsystem-specific reference counters are not affected by the
2792 2798   * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2793 2799   * removed from the specified zone's reference list.  ref must be non-NULL iff
2794 2800   * subsys is not ZONE_REF_NUM_SUBSYS.
2795 2801   */
2796 2802  static void
2797 2803  zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2798 2804  {
2799 2805          boolean_t wakeup;
2800 2806  
2801 2807          mutex_enter(&z->zone_lock);
2802 2808          ASSERT(z->zone_ref != 0);
2803 2809          z->zone_ref--;
2804 2810          if (subsys != ZONE_REF_NUM_SUBSYS) {
2805 2811                  ASSERT(z->zone_subsys_ref[subsys] != 0);
2806 2812                  z->zone_subsys_ref[subsys]--;
2807 2813                  list_remove(&z->zone_ref_list, ref);
2808 2814          }
2809 2815          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2810 2816                  /* no more refs, free the structure */
2811 2817                  mutex_exit(&z->zone_lock);
2812 2818                  zone_free(z);
2813 2819                  return;
2814 2820          }
2815 2821          /* signal zone_destroy so the zone can finish halting */
2816 2822          wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2817 2823          mutex_exit(&z->zone_lock);
2818 2824  
2819 2825          if (wakeup) {
2820 2826                  /*
2821 2827                   * Grabbing zonehash_lock here effectively synchronizes with
2822 2828                   * zone_destroy() to avoid missed signals.
2823 2829                   */
2824 2830                  mutex_enter(&zonehash_lock);
2825 2831                  cv_broadcast(&zone_destroy_cv);
2826 2832                  mutex_exit(&zonehash_lock);
2827 2833          }
2828 2834  }
2829 2835  
2830 2836  /*
2831 2837   * Decrement the specified zone's reference count.  The specified zone will
2832 2838   * cease to exist after this function returns if the reference count drops to
2833 2839   * zero.  This function should be paired with zone_hold().
2834 2840   */
2835 2841  void
2836 2842  zone_rele(zone_t *z)
2837 2843  {
2838 2844          zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2839 2845  }
2840 2846  
2841 2847  /*
2842 2848   * Initialize a zone reference structure.  This function must be invoked for
2843 2849   * a reference structure before the structure is passed to zone_hold_ref().
2844 2850   */
2845 2851  void
2846 2852  zone_init_ref(zone_ref_t *ref)
2847 2853  {
2848 2854          ref->zref_zone = NULL;
2849 2855          list_link_init(&ref->zref_linkage);
2850 2856  }
2851 2857  
2852 2858  /*
2853 2859   * Acquire a reference to zone z.  The caller must specify the
2854 2860   * zone_ref_subsys_t constant associated with its subsystem.  The specified
2855 2861   * zone_ref_t structure will represent a reference to the specified zone.  Use
2856 2862   * zone_rele_ref() to release the reference.
2857 2863   *
2858 2864   * The referenced zone_t structure will not be freed as long as the zone_t's
2859 2865   * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2860 2866   * references.
2861 2867   *
2862 2868   * NOTE: The zone_ref_t structure must be initialized before it is used.
2863 2869   * See zone_init_ref() above.
2864 2870   */
2865 2871  void
2866 2872  zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2867 2873  {
2868 2874          ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2869 2875  
2870 2876          /*
2871 2877           * Prevent consumers from reusing a reference structure before
2872 2878           * releasing it.
2873 2879           */
2874 2880          VERIFY(ref->zref_zone == NULL);
2875 2881  
2876 2882          ref->zref_zone = z;
2877 2883          mutex_enter(&z->zone_lock);
2878 2884          zone_hold_locked(z);
2879 2885          z->zone_subsys_ref[subsys]++;
2880 2886          ASSERT(z->zone_subsys_ref[subsys] != 0);
2881 2887          list_insert_head(&z->zone_ref_list, ref);
2882 2888          mutex_exit(&z->zone_lock);
2883 2889  }
2884 2890  
2885 2891  /*
2886 2892   * Release the zone reference represented by the specified zone_ref_t.
2887 2893   * The reference is invalid after it's released; however, the zone_ref_t
2888 2894   * structure can be reused without having to invoke zone_init_ref().
2889 2895   * subsys should be the same value that was passed to zone_hold_ref()
2890 2896   * when the reference was acquired.
2891 2897   */
2892 2898  void
2893 2899  zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2894 2900  {
2895 2901          zone_rele_common(ref->zref_zone, ref, subsys);
2896 2902  
2897 2903          /*
2898 2904           * Set the zone_ref_t's zref_zone field to NULL to generate panics
2899 2905           * when consumers dereference the reference.  This helps us catch
2900 2906           * consumers who use released references.  Furthermore, this lets
2901 2907           * consumers reuse the zone_ref_t structure without having to
2902 2908           * invoke zone_init_ref().
2903 2909           */
2904 2910          ref->zref_zone = NULL;
2905 2911  }
2906 2912  
2907 2913  void
2908 2914  zone_cred_hold(zone_t *z)
2909 2915  {
2910 2916          mutex_enter(&z->zone_lock);
2911 2917          z->zone_cred_ref++;
2912 2918          ASSERT(z->zone_cred_ref != 0);
2913 2919          mutex_exit(&z->zone_lock);
2914 2920  }
2915 2921  
2916 2922  void
2917 2923  zone_cred_rele(zone_t *z)
2918 2924  {
2919 2925          boolean_t wakeup;
2920 2926  
2921 2927          mutex_enter(&z->zone_lock);
2922 2928          ASSERT(z->zone_cred_ref != 0);
2923 2929          z->zone_cred_ref--;
2924 2930          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2925 2931                  /* no more refs, free the structure */
2926 2932                  mutex_exit(&z->zone_lock);
2927 2933                  zone_free(z);
2928 2934                  return;
2929 2935          }
2930 2936          /*
2931 2937           * If zone_destroy is waiting for the cred references to drain
2932 2938           * out, and they have, signal it.
2933 2939           */
2934 2940          wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2935 2941              zone_status_get(z) >= ZONE_IS_DEAD);
2936 2942          mutex_exit(&z->zone_lock);
2937 2943  
2938 2944          if (wakeup) {
2939 2945                  /*
2940 2946                   * Grabbing zonehash_lock here effectively synchronizes with
2941 2947                   * zone_destroy() to avoid missed signals.
2942 2948                   */
2943 2949                  mutex_enter(&zonehash_lock);
2944 2950                  cv_broadcast(&zone_destroy_cv);
2945 2951                  mutex_exit(&zonehash_lock);
2946 2952          }
2947 2953  }
2948 2954  
2949 2955  void
2950 2956  zone_task_hold(zone_t *z)
2951 2957  {
2952 2958          mutex_enter(&z->zone_lock);
2953 2959          z->zone_ntasks++;
2954 2960          ASSERT(z->zone_ntasks != 0);
2955 2961          mutex_exit(&z->zone_lock);
2956 2962  }
2957 2963  
2958 2964  void
2959 2965  zone_task_rele(zone_t *zone)
2960 2966  {
2961 2967          uint_t refcnt;
2962 2968  
2963 2969          mutex_enter(&zone->zone_lock);
2964 2970          ASSERT(zone->zone_ntasks != 0);
2965 2971          refcnt = --zone->zone_ntasks;
2966 2972          if (refcnt > 1) {       /* Common case */
2967 2973                  mutex_exit(&zone->zone_lock);
2968 2974                  return;
2969 2975          }
2970 2976          zone_hold_locked(zone); /* so we can use the zone_t later */
2971 2977          mutex_exit(&zone->zone_lock);
2972 2978          if (refcnt == 1) {
2973 2979                  /*
2974 2980                   * See if the zone is shutting down.
2975 2981                   */
2976 2982                  mutex_enter(&zone_status_lock);
2977 2983                  if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2978 2984                          goto out;
2979 2985                  }
2980 2986  
2981 2987                  /*
2982 2988                   * Make sure the ntasks didn't change since we
2983 2989                   * dropped zone_lock.
2984 2990                   */
2985 2991                  mutex_enter(&zone->zone_lock);
2986 2992                  if (refcnt != zone->zone_ntasks) {
2987 2993                          mutex_exit(&zone->zone_lock);
2988 2994                          goto out;
2989 2995                  }
2990 2996                  mutex_exit(&zone->zone_lock);
2991 2997  
2992 2998                  /*
2993 2999                   * No more user processes in the zone.  The zone is empty.
2994 3000                   */
2995 3001                  zone_status_set(zone, ZONE_IS_EMPTY);
2996 3002                  goto out;
2997 3003          }
2998 3004  
2999 3005          ASSERT(refcnt == 0);
3000 3006          /*
3001 3007           * zsched has exited; the zone is dead.
3002 3008           */
3003 3009          zone->zone_zsched = NULL;               /* paranoia */
3004 3010          mutex_enter(&zone_status_lock);
3005 3011          zone_status_set(zone, ZONE_IS_DEAD);
3006 3012  out:
3007 3013          mutex_exit(&zone_status_lock);
3008 3014          zone_rele(zone);
3009 3015  }
3010 3016  
3011 3017  zoneid_t
3012 3018  getzoneid(void)
3013 3019  {
3014 3020          return (curproc->p_zone->zone_id);
3015 3021  }
3016 3022  
3017 3023  /*
3018 3024   * Internal versions of zone_find_by_*().  These don't zone_hold() or
3019 3025   * check the validity of a zone's state.
3020 3026   */
3021 3027  static zone_t *
3022 3028  zone_find_all_by_id(zoneid_t zoneid)
3023 3029  {
3024 3030          mod_hash_val_t hv;
3025 3031          zone_t *zone = NULL;
3026 3032  
3027 3033          ASSERT(MUTEX_HELD(&zonehash_lock));
3028 3034  
3029 3035          if (mod_hash_find(zonehashbyid,
3030 3036              (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3031 3037                  zone = (zone_t *)hv;
3032 3038          return (zone);
3033 3039  }
3034 3040  
3035 3041  static zone_t *
3036 3042  zone_find_all_by_label(const ts_label_t *label)
3037 3043  {
3038 3044          mod_hash_val_t hv;
3039 3045          zone_t *zone = NULL;
3040 3046  
3041 3047          ASSERT(MUTEX_HELD(&zonehash_lock));
3042 3048  
3043 3049          /*
3044 3050           * zonehashbylabel is not maintained for unlabeled systems
3045 3051           */
3046 3052          if (!is_system_labeled())
3047 3053                  return (NULL);
3048 3054          if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3049 3055                  zone = (zone_t *)hv;
3050 3056          return (zone);
3051 3057  }
3052 3058  
3053 3059  static zone_t *
3054 3060  zone_find_all_by_name(char *name)
3055 3061  {
3056 3062          mod_hash_val_t hv;
3057 3063          zone_t *zone = NULL;
3058 3064  
3059 3065          ASSERT(MUTEX_HELD(&zonehash_lock));
3060 3066  
3061 3067          if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3062 3068                  zone = (zone_t *)hv;
3063 3069          return (zone);
3064 3070  }
3065 3071  
3066 3072  /*
3067 3073   * Public interface for looking up a zone by zoneid.  Only returns the zone if
3068 3074   * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3069 3075   * Caller must call zone_rele() once it is done with the zone.
3070 3076   *
3071 3077   * The zone may begin the zone_destroy() sequence immediately after this
3072 3078   * function returns, but may be safely used until zone_rele() is called.
3073 3079   */
3074 3080  zone_t *
3075 3081  zone_find_by_id(zoneid_t zoneid)
3076 3082  {
3077 3083          zone_t *zone;
3078 3084          zone_status_t status;
3079 3085  
3080 3086          mutex_enter(&zonehash_lock);
3081 3087          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3082 3088                  mutex_exit(&zonehash_lock);
3083 3089                  return (NULL);
3084 3090          }
3085 3091          status = zone_status_get(zone);
3086 3092          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3087 3093                  /*
3088 3094                   * For all practical purposes the zone doesn't exist.
3089 3095                   */
3090 3096                  mutex_exit(&zonehash_lock);
3091 3097                  return (NULL);
3092 3098          }
3093 3099          zone_hold(zone);
3094 3100          mutex_exit(&zonehash_lock);
3095 3101          return (zone);
3096 3102  }
3097 3103  
3098 3104  /*
3099 3105   * Similar to zone_find_by_id, but using zone label as the key.
3100 3106   */
3101 3107  zone_t *
3102 3108  zone_find_by_label(const ts_label_t *label)
3103 3109  {
3104 3110          zone_t *zone;
3105 3111          zone_status_t status;
3106 3112  
3107 3113          mutex_enter(&zonehash_lock);
3108 3114          if ((zone = zone_find_all_by_label(label)) == NULL) {
3109 3115                  mutex_exit(&zonehash_lock);
3110 3116                  return (NULL);
3111 3117          }
3112 3118  
3113 3119          status = zone_status_get(zone);
3114 3120          if (status > ZONE_IS_DOWN) {
3115 3121                  /*
3116 3122                   * For all practical purposes the zone doesn't exist.
3117 3123                   */
3118 3124                  mutex_exit(&zonehash_lock);
3119 3125                  return (NULL);
3120 3126          }
3121 3127          zone_hold(zone);
3122 3128          mutex_exit(&zonehash_lock);
3123 3129          return (zone);
3124 3130  }
3125 3131  
3126 3132  /*
3127 3133   * Similar to zone_find_by_id, but using zone name as the key.
3128 3134   */
3129 3135  zone_t *
3130 3136  zone_find_by_name(char *name)
3131 3137  {
3132 3138          zone_t *zone;
3133 3139          zone_status_t status;
3134 3140  
3135 3141          mutex_enter(&zonehash_lock);
3136 3142          if ((zone = zone_find_all_by_name(name)) == NULL) {
3137 3143                  mutex_exit(&zonehash_lock);
3138 3144                  return (NULL);
3139 3145          }
3140 3146          status = zone_status_get(zone);
3141 3147          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3142 3148                  /*
3143 3149                   * For all practical purposes the zone doesn't exist.
3144 3150                   */
3145 3151                  mutex_exit(&zonehash_lock);
3146 3152                  return (NULL);
3147 3153          }
3148 3154          zone_hold(zone);
3149 3155          mutex_exit(&zonehash_lock);
3150 3156          return (zone);
3151 3157  }
3152 3158  
3153 3159  /*
3154 3160   * Similar to zone_find_by_id(), using the path as a key.  For instance,
3155 3161   * if there is a zone "foo" rooted at /foo/root, and the path argument
3156 3162   * is "/foo/root/proc", it will return the held zone_t corresponding to
3157 3163   * zone "foo".
3158 3164   *
3159 3165   * zone_find_by_path() always returns a non-NULL value, since at the
3160 3166   * very least every path will be contained in the global zone.
3161 3167   *
3162 3168   * As with the other zone_find_by_*() functions, the caller is
3163 3169   * responsible for zone_rele()ing the return value of this function.
3164 3170   */
3165 3171  zone_t *
3166 3172  zone_find_by_path(const char *path)
3167 3173  {
3168 3174          zone_t *zone;
3169 3175          zone_t *zret = NULL;
3170 3176          zone_status_t status;
3171 3177  
3172 3178          if (path == NULL) {
3173 3179                  /*
3174 3180                   * Call from rootconf().
3175 3181                   */
3176 3182                  zone_hold(global_zone);
3177 3183                  return (global_zone);
3178 3184          }
3179 3185          ASSERT(*path == '/');
3180 3186          mutex_enter(&zonehash_lock);
3181 3187          for (zone = list_head(&zone_active); zone != NULL;
3182 3188              zone = list_next(&zone_active, zone)) {
3183 3189                  if (ZONE_PATH_VISIBLE(path, zone))
3184 3190                          zret = zone;
3185 3191          }
3186 3192          ASSERT(zret != NULL);
3187 3193          status = zone_status_get(zret);
3188 3194          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3189 3195                  /*
3190 3196                   * Zone practically doesn't exist.
3191 3197                   */
3192 3198                  zret = global_zone;
3193 3199          }
3194 3200          zone_hold(zret);
3195 3201          mutex_exit(&zonehash_lock);

↓ open down ↓

795 lines elided

↑ open up ↑

3196 3202          return (zret);
3197 3203  }
3198 3204  
3199 3205  /*
3200 3206   * Public interface for updating per-zone load averages.  Called once per
3201 3207   * second.
3202 3208   *
3203 3209   * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3204 3210   */
3205 3211  void
3206      -zone_loadavg_update()
     3212 +zone_loadavg_update(void)
3207 3213  {
3208 3214          zone_t *zp;
3209 3215          zone_status_t status;
3210 3216          struct loadavg_s *lavg;
3211 3217          hrtime_t zone_total;
     3218 +        uint64_t tmp;
3212 3219          int i;
3213 3220          hrtime_t hr_avg;
3214 3221          int nrun;
3215 3222          static int64_t f[3] = { 135, 27, 9 };
3216 3223          int64_t q, r;
3217 3224  
3218 3225          mutex_enter(&zonehash_lock);
3219 3226          for (zp = list_head(&zone_active); zp != NULL;
3220 3227              zp = list_next(&zone_active, zp)) {
3221 3228                  mutex_enter(&zp->zone_lock);

3222 3229  
3223 3230                  /* Skip zones that are on the way down or not yet up */
3224 3231                  status = zone_status_get(zp);
3225 3232                  if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {

↓ open down ↓

4 lines elided

↑ open up ↑

3226 3233                          /* For all practical purposes the zone doesn't exist. */
3227 3234                          mutex_exit(&zp->zone_lock);
3228 3235                          continue;
3229 3236                  }
3230 3237  
3231 3238                  /*
3232 3239                   * Update the 10 second moving average data in zone_loadavg.
3233 3240                   */
3234 3241                  lavg = &zp->zone_loadavg;
3235 3242  
3236      -                zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
     3243 +                tmp = cpu_uarray_sum_all(zp->zone_ustate);
     3244 +                zone_total = UINT64_OVERFLOW_TO_INT64(tmp);
     3245 +
3237 3246                  scalehrtime(&zone_total);
3238 3247  
3239 3248                  /* The zone_total should always be increasing. */
3240 3249                  lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3241 3250                      zone_total - lavg->lg_total : 0;
3242 3251                  lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3243 3252                  /* lg_total holds the prev. 1 sec. total */
3244 3253                  lavg->lg_total = zone_total;
3245 3254  
3246 3255                  /*

3247 3256                   * To simplify the calculation, we don't calculate the load avg.
3248 3257                   * until the zone has been up for at least 10 seconds and our
3249 3258                   * moving average is thus full.
3250 3259                   */
3251 3260                  if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3252 3261                          lavg->lg_len++;
3253 3262                          mutex_exit(&zp->zone_lock);
3254 3263                          continue;
3255 3264                  }
3256 3265  
3257 3266                  /* Now calculate the 1min, 5min, 15 min load avg. */
3258 3267                  hr_avg = 0;
3259 3268                  for (i = 0; i < S_LOADAVG_SZ; i++)
3260 3269                          hr_avg += lavg->lg_loads[i];
3261 3270                  hr_avg = hr_avg / S_LOADAVG_SZ;
3262 3271                  nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3263 3272  
3264 3273                  /* Compute load avg. See comment in calcloadavg() */
3265 3274                  for (i = 0; i < 3; i++) {
3266 3275                          q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3267 3276                          r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3268 3277                          zp->zone_hp_avenrun[i] +=
3269 3278                              ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3270 3279  
3271 3280                          /* avenrun[] can only hold 31 bits of load avg. */
3272 3281                          if (zp->zone_hp_avenrun[i] <
3273 3282                              ((uint64_t)1<<(31+16-FSHIFT)))
3274 3283                                  zp->zone_avenrun[i] = (int32_t)
3275 3284                                      (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3276 3285                          else
3277 3286                                  zp->zone_avenrun[i] = 0x7fffffff;
3278 3287                  }
3279 3288  
3280 3289                  mutex_exit(&zp->zone_lock);
3281 3290          }
3282 3291          mutex_exit(&zonehash_lock);
3283 3292  }
3284 3293  
3285 3294  /*
3286 3295   * Get the number of cpus visible to this zone.  The system-wide global
3287 3296   * 'ncpus' is returned if pools are disabled, the caller is in the
3288 3297   * global zone, or a NULL zone argument is passed in.
3289 3298   */
3290 3299  int
3291 3300  zone_ncpus_get(zone_t *zone)
3292 3301  {
3293 3302          int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3294 3303  
3295 3304          return (myncpus != 0 ? myncpus : ncpus);
3296 3305  }
3297 3306  
3298 3307  /*
3299 3308   * Get the number of online cpus visible to this zone.  The system-wide
3300 3309   * global 'ncpus_online' is returned if pools are disabled, the caller
3301 3310   * is in the global zone, or a NULL zone argument is passed in.
3302 3311   */
3303 3312  int
3304 3313  zone_ncpus_online_get(zone_t *zone)
3305 3314  {
3306 3315          int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3307 3316  
3308 3317          return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3309 3318  }
3310 3319  
3311 3320  /*
3312 3321   * Return the pool to which the zone is currently bound.
3313 3322   */
3314 3323  pool_t *
3315 3324  zone_pool_get(zone_t *zone)
3316 3325  {
3317 3326          ASSERT(pool_lock_held());
3318 3327  
3319 3328          return (zone->zone_pool);
3320 3329  }
3321 3330  
3322 3331  /*
3323 3332   * Set the zone's pool pointer and update the zone's visibility to match
3324 3333   * the resources in the new pool.
3325 3334   */
3326 3335  void
3327 3336  zone_pool_set(zone_t *zone, pool_t *pool)
3328 3337  {
3329 3338          ASSERT(pool_lock_held());
3330 3339          ASSERT(MUTEX_HELD(&cpu_lock));
3331 3340  
3332 3341          zone->zone_pool = pool;
3333 3342          zone_pset_set(zone, pool->pool_pset->pset_id);
3334 3343  }
3335 3344  
3336 3345  /*
3337 3346   * Return the cached value of the id of the processor set to which the
3338 3347   * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3339 3348   * facility is disabled.
3340 3349   */
3341 3350  psetid_t
3342 3351  zone_pset_get(zone_t *zone)
3343 3352  {
3344 3353          ASSERT(MUTEX_HELD(&cpu_lock));
3345 3354  
3346 3355          return (zone->zone_psetid);
3347 3356  }
3348 3357  
3349 3358  /*
3350 3359   * Set the cached value of the id of the processor set to which the zone
3351 3360   * is currently bound.  Also update the zone's visibility to match the
3352 3361   * resources in the new processor set.
3353 3362   */
3354 3363  void
3355 3364  zone_pset_set(zone_t *zone, psetid_t newpsetid)
3356 3365  {
3357 3366          psetid_t oldpsetid;
3358 3367  
3359 3368          ASSERT(MUTEX_HELD(&cpu_lock));
3360 3369          oldpsetid = zone_pset_get(zone);
3361 3370  
3362 3371          if (oldpsetid == newpsetid)
3363 3372                  return;
3364 3373          /*
3365 3374           * Global zone sees all.
3366 3375           */
3367 3376          if (zone != global_zone) {
3368 3377                  zone->zone_psetid = newpsetid;
3369 3378                  if (newpsetid != ZONE_PS_INVAL)
3370 3379                          pool_pset_visibility_add(newpsetid, zone);
3371 3380                  if (oldpsetid != ZONE_PS_INVAL)
3372 3381                          pool_pset_visibility_remove(oldpsetid, zone);
3373 3382          }
3374 3383          /*
3375 3384           * Disabling pools, so we should start using the global values
3376 3385           * for ncpus and ncpus_online.
3377 3386           */
3378 3387          if (newpsetid == ZONE_PS_INVAL) {
3379 3388                  zone->zone_ncpus = 0;
3380 3389                  zone->zone_ncpus_online = 0;
3381 3390          }
3382 3391  }
3383 3392  
3384 3393  /*
3385 3394   * Walk the list of active zones and issue the provided callback for
3386 3395   * each of them.
3387 3396   *
3388 3397   * Caller must not be holding any locks that may be acquired under
3389 3398   * zonehash_lock.  See comment at the beginning of the file for a list of
3390 3399   * common locks and their interactions with zones.
3391 3400   */
3392 3401  int
3393 3402  zone_walk(int (*cb)(zone_t *, void *), void *data)
3394 3403  {
3395 3404          zone_t *zone;
3396 3405          int ret = 0;
3397 3406          zone_status_t status;
3398 3407  
3399 3408          mutex_enter(&zonehash_lock);
3400 3409          for (zone = list_head(&zone_active); zone != NULL;
3401 3410              zone = list_next(&zone_active, zone)) {
3402 3411                  /*
3403 3412                   * Skip zones that shouldn't be externally visible.
3404 3413                   */
3405 3414                  status = zone_status_get(zone);
3406 3415                  if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3407 3416                          continue;
3408 3417                  /*
3409 3418                   * Bail immediately if any callback invocation returns a
3410 3419                   * non-zero value.
3411 3420                   */
3412 3421                  ret = (*cb)(zone, data);
3413 3422                  if (ret != 0)
3414 3423                          break;
3415 3424          }
3416 3425          mutex_exit(&zonehash_lock);
3417 3426          return (ret);
3418 3427  }
3419 3428  
3420 3429  static int
3421 3430  zone_set_root(zone_t *zone, const char *upath)
3422 3431  {
3423 3432          vnode_t *vp;
3424 3433          int trycount;
3425 3434          int error = 0;
3426 3435          char *path;
3427 3436          struct pathname upn, pn;
3428 3437          size_t pathlen;
3429 3438  
3430 3439          if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3431 3440                  return (error);
3432 3441  
3433 3442          pn_alloc(&pn);
3434 3443  
3435 3444          /* prevent infinite loop */
3436 3445          trycount = 10;
3437 3446          for (;;) {
3438 3447                  if (--trycount <= 0) {
3439 3448                          error = ESTALE;
3440 3449                          goto out;
3441 3450                  }
3442 3451  
3443 3452                  if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3444 3453                          /*
3445 3454                           * VOP_ACCESS() may cover 'vp' with a new
3446 3455                           * filesystem, if 'vp' is an autoFS vnode.
3447 3456                           * Get the new 'vp' if so.
3448 3457                           */
3449 3458                          if ((error =
3450 3459                              VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3451 3460                              (!vn_ismntpt(vp) ||
3452 3461                              (error = traverse(&vp)) == 0)) {
3453 3462                                  pathlen = pn.pn_pathlen + 2;
3454 3463                                  path = kmem_alloc(pathlen, KM_SLEEP);
3455 3464                                  (void) strncpy(path, pn.pn_path,
3456 3465                                      pn.pn_pathlen + 1);
3457 3466                                  path[pathlen - 2] = '/';
3458 3467                                  path[pathlen - 1] = '\0';
3459 3468                                  pn_free(&pn);
3460 3469                                  pn_free(&upn);
3461 3470  
3462 3471                                  /* Success! */
3463 3472                                  break;
3464 3473                          }
3465 3474                          VN_RELE(vp);
3466 3475                  }
3467 3476                  if (error != ESTALE)
3468 3477                          goto out;
3469 3478          }
3470 3479  
3471 3480          ASSERT(error == 0);
3472 3481          zone->zone_rootvp = vp;         /* we hold a reference to vp */
3473 3482          zone->zone_rootpath = path;
3474 3483          zone->zone_rootpathlen = pathlen;
3475 3484          if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3476 3485                  zone->zone_flags |= ZF_IS_SCRATCH;
3477 3486          return (0);
3478 3487  
3479 3488  out:
3480 3489          pn_free(&pn);
3481 3490          pn_free(&upn);
3482 3491          return (error);
3483 3492  }
3484 3493  
/* ASCII-only alphanumeric test.  Note that (c) is evaluated more than once. */
#define isalnum(c)      (((c) >= 'A' && (c) <= 'Z') || \
                        ((c) >= 'a' && (c) <= 'z') || \
                        ((c) >= '0' && (c) <= '9'))
3488 3497  
3489 3498  static int
3490 3499  zone_set_name(zone_t *zone, const char *uname)
3491 3500  {
3492 3501          char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3493 3502          size_t len;
3494 3503          int i, err;
3495 3504  
3496 3505          if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3497 3506                  kmem_free(kname, ZONENAME_MAX);
3498 3507                  return (err);   /* EFAULT or ENAMETOOLONG */
3499 3508          }
3500 3509  
3501 3510          /* must be less than ZONENAME_MAX */
3502 3511          if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3503 3512                  kmem_free(kname, ZONENAME_MAX);
3504 3513                  return (EINVAL);
3505 3514          }
3506 3515  
3507 3516          /*
3508 3517           * Name must start with an alphanumeric and must contain only
3509 3518           * alphanumerics, '-', '_' and '.'.
3510 3519           */
3511 3520          if (!isalnum(kname[0])) {
3512 3521                  kmem_free(kname, ZONENAME_MAX);
3513 3522                  return (EINVAL);
3514 3523          }
3515 3524          for (i = 1; i < len - 1; i++) {
3516 3525                  if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3517 3526                      kname[i] != '.') {
3518 3527                          kmem_free(kname, ZONENAME_MAX);
3519 3528                          return (EINVAL);
3520 3529                  }
3521 3530          }
3522 3531  
3523 3532          zone->zone_name = kname;
3524 3533          return (0);
3525 3534  }
3526 3535  
3527 3536  /*
3528 3537   * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3529 3538   * is NULL or it points to a zone with no hostid emulation, then the machine's
3530 3539   * hostid (i.e., the global zone's hostid) is returned.  This function returns
3531 3540   * zero if neither the zone nor the host machine (global zone) have hostids.  It
3532 3541   * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3533 3542   * hostid and the machine's hostid is invalid.
3534 3543   */
3535 3544  uint32_t
3536 3545  zone_get_hostid(zone_t *zonep)
3537 3546  {
3538 3547          unsigned long machine_hostid;
3539 3548  
3540 3549          if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3541 3550                  if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3542 3551                          return (HW_INVALID_HOSTID);
3543 3552                  return ((uint32_t)machine_hostid);
3544 3553          }
3545 3554          return (zonep->zone_hostid);
3546 3555  }
3547 3556  
3548 3557  /*
3549 3558   * Similar to thread_create(), but makes sure the thread is in the appropriate
3550 3559   * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3551 3560   */
3552 3561  /*ARGSUSED*/
3553 3562  kthread_t *
3554 3563  zthread_create(
3555 3564      caddr_t stk,
3556 3565      size_t stksize,
3557 3566      void (*proc)(),
3558 3567      void *arg,
3559 3568      size_t len,
3560 3569      pri_t pri)
3561 3570  {
3562 3571          kthread_t *t;
3563 3572          zone_t *zone = curproc->p_zone;
3564 3573          proc_t *pp = zone->zone_zsched;
3565 3574  
3566 3575          zone_hold(zone);        /* Reference to be dropped when thread exits */
3567 3576  
3568 3577          /*
3569 3578           * No-one should be trying to create threads if the zone is shutting
3570 3579           * down and there aren't any kernel threads around.  See comment
3571 3580           * in zthread_exit().
3572 3581           */
3573 3582          ASSERT(!(zone->zone_kthreads == NULL &&
3574 3583              zone_status_get(zone) >= ZONE_IS_EMPTY));
3575 3584          /*
3576 3585           * Create a thread, but don't let it run until we've finished setting
3577 3586           * things up.
3578 3587           */
3579 3588          t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3580 3589          ASSERT(t->t_forw == NULL);
3581 3590          mutex_enter(&zone_status_lock);
3582 3591          if (zone->zone_kthreads == NULL) {
3583 3592                  t->t_forw = t->t_back = t;
3584 3593          } else {
3585 3594                  kthread_t *tx = zone->zone_kthreads;
3586 3595  
3587 3596                  t->t_forw = tx;
3588 3597                  t->t_back = tx->t_back;
3589 3598                  tx->t_back->t_forw = t;
3590 3599                  tx->t_back = t;
3591 3600          }
3592 3601          zone->zone_kthreads = t;
3593 3602          mutex_exit(&zone_status_lock);
3594 3603  
3595 3604          mutex_enter(&pp->p_lock);
3596 3605          t->t_proc_flag |= TP_ZTHREAD;
3597 3606          project_rele(t->t_proj);
3598 3607          t->t_proj = project_hold(pp->p_task->tk_proj);
3599 3608  
3600 3609          /*
3601 3610           * Setup complete, let it run.
3602 3611           */
3603 3612          thread_lock(t);
3604 3613          t->t_schedflag |= TS_ALLSTART;
3605 3614          setrun_locked(t);
3606 3615          thread_unlock(t);
3607 3616  
3608 3617          mutex_exit(&pp->p_lock);
3609 3618  
3610 3619          return (t);
3611 3620  }
3612 3621  
3613 3622  /*
3614 3623   * Similar to thread_exit().  Must be called by threads created via
3615 3624   * zthread_create().
3616 3625   */
3617 3626  void
3618 3627  zthread_exit(void)
3619 3628  {
3620 3629          kthread_t *t = curthread;
3621 3630          proc_t *pp = curproc;
3622 3631          zone_t *zone = pp->p_zone;
3623 3632  
3624 3633          mutex_enter(&zone_status_lock);
3625 3634  
3626 3635          /*
3627 3636           * Reparent to p0
3628 3637           */
3629 3638          kpreempt_disable();
3630 3639          mutex_enter(&pp->p_lock);
3631 3640          t->t_proc_flag &= ~TP_ZTHREAD;
3632 3641          t->t_procp = &p0;
3633 3642          hat_thread_exit(t);
3634 3643          mutex_exit(&pp->p_lock);
3635 3644          kpreempt_enable();
3636 3645  
3637 3646          if (t->t_back == t) {
3638 3647                  ASSERT(t->t_forw == t);
3639 3648                  /*
3640 3649                   * If the zone is empty, once the thread count
3641 3650                   * goes to zero no further kernel threads can be
3642 3651                   * created.  This is because if the creator is a process
3643 3652                   * in the zone, then it must have exited before the zone
3644 3653                   * state could be set to ZONE_IS_EMPTY.
3645 3654                   * Otherwise, if the creator is a kernel thread in the
3646 3655                   * zone, the thread count is non-zero.
3647 3656                   *
3648 3657                   * This really means that non-zone kernel threads should
3649 3658                   * not create zone kernel threads.
3650 3659                   */
3651 3660                  zone->zone_kthreads = NULL;
3652 3661                  if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3653 3662                          zone_status_set(zone, ZONE_IS_DOWN);
3654 3663                          /*
3655 3664                           * Remove any CPU caps on this zone.
3656 3665                           */
3657 3666                          cpucaps_zone_remove(zone);
3658 3667                  }
3659 3668          } else {
3660 3669                  t->t_forw->t_back = t->t_back;
3661 3670                  t->t_back->t_forw = t->t_forw;
3662 3671                  if (zone->zone_kthreads == t)
3663 3672                          zone->zone_kthreads = t->t_forw;
3664 3673          }
3665 3674          mutex_exit(&zone_status_lock);
3666 3675          zone_rele(zone);
3667 3676          thread_exit();
3668 3677          /* NOTREACHED */
3669 3678  }
3670 3679  
3671 3680  static void
3672 3681  zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3673 3682  {
3674 3683          vnode_t *oldvp;
3675 3684  
3676 3685          /* we're going to hold a reference here to the directory */
3677 3686          VN_HOLD(vp);
3678 3687  
3679 3688          /* update abs cwd/root path see c2/audit.c */
3680 3689          if (AU_AUDITING())
3681 3690                  audit_chdirec(vp, vpp);
3682 3691  
3683 3692          mutex_enter(&pp->p_lock);
3684 3693          oldvp = *vpp;
3685 3694          *vpp = vp;
3686 3695          mutex_exit(&pp->p_lock);
3687 3696          if (oldvp != NULL)
3688 3697                  VN_RELE(oldvp);
3689 3698  }
3690 3699  
3691 3700  /*
3692 3701   * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3693 3702   */
3694 3703  static int
3695 3704  nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3696 3705  {
3697 3706          nvpair_t *nvp = NULL;
3698 3707          boolean_t priv_set = B_FALSE;
3699 3708          boolean_t limit_set = B_FALSE;
3700 3709          boolean_t action_set = B_FALSE;
3701 3710  
3702 3711          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3703 3712                  const char *name;
3704 3713                  uint64_t ui64;
3705 3714  
3706 3715                  name = nvpair_name(nvp);
3707 3716                  if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3708 3717                          return (EINVAL);
3709 3718                  (void) nvpair_value_uint64(nvp, &ui64);
3710 3719                  if (strcmp(name, "privilege") == 0) {
3711 3720                          /*
3712 3721                           * Currently only privileged values are allowed, but
3713 3722                           * this may change in the future.
3714 3723                           */
3715 3724                          if (ui64 != RCPRIV_PRIVILEGED)
3716 3725                                  return (EINVAL);
3717 3726                          rv->rcv_privilege = ui64;
3718 3727                          priv_set = B_TRUE;
3719 3728                  } else if (strcmp(name, "limit") == 0) {
3720 3729                          rv->rcv_value = ui64;
3721 3730                          limit_set = B_TRUE;
3722 3731                  } else if (strcmp(name, "action") == 0) {
3723 3732                          if (ui64 != RCTL_LOCAL_NOACTION &&
3724 3733                              ui64 != RCTL_LOCAL_DENY)
3725 3734                                  return (EINVAL);
3726 3735                          rv->rcv_flagaction = ui64;
3727 3736                          action_set = B_TRUE;
3728 3737                  } else {
3729 3738                          return (EINVAL);
3730 3739                  }
3731 3740          }
3732 3741  
3733 3742          if (!(priv_set && limit_set && action_set))
3734 3743                  return (EINVAL);
3735 3744          rv->rcv_action_signal = 0;
3736 3745          rv->rcv_action_recipient = NULL;
3737 3746          rv->rcv_action_recip_pid = -1;
3738 3747          rv->rcv_firing_time = 0;
3739 3748  
3740 3749          return (0);
3741 3750  }
3742 3751  
3743 3752  /*
3744 3753   * Non-global zone version of start_init.
3745 3754   */
3746 3755  void
3747 3756  zone_start_init(void)
3748 3757  {
3749 3758          proc_t *p = ttoproc(curthread);
3750 3759          zone_t *z = p->p_zone;
3751 3760  
3752 3761          ASSERT(!INGLOBALZONE(curproc));
3753 3762  
3754 3763          /*
3755 3764           * For all purposes (ZONE_ATTR_INITPID and restart_init),
3756 3765           * storing just the pid of init is sufficient.
3757 3766           */
3758 3767          z->zone_proc_initpid = p->p_pid;
3759 3768  
3760 3769          /*
3761 3770           * We maintain zone_boot_err so that we can return the cause of the
3762 3771           * failure back to the caller of the zone_boot syscall.
3763 3772           */
3764 3773          p->p_zone->zone_boot_err = start_init_common();
3765 3774  
3766 3775          /*
3767 3776           * We will prevent booting zones from becoming running zones if the
3768 3777           * global zone is shutting down.
3769 3778           */
3770 3779          mutex_enter(&zone_status_lock);
3771 3780          if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3772 3781              ZONE_IS_SHUTTING_DOWN) {
3773 3782                  /*
3774 3783                   * Make sure we are still in the booting state-- we could have
3775 3784                   * raced and already be shutting down, or even further along.
3776 3785                   */
3777 3786                  if (zone_status_get(z) == ZONE_IS_BOOTING) {
3778 3787                          zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3779 3788                  }
3780 3789                  mutex_exit(&zone_status_lock);
3781 3790                  /* It's gone bad, dispose of the process */
3782 3791                  if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3783 3792                          mutex_enter(&p->p_lock);
3784 3793                          ASSERT(p->p_flag & SEXITLWPS);
3785 3794                          lwp_exit();
3786 3795                  }
3787 3796          } else {
3788 3797                  if (zone_status_get(z) == ZONE_IS_BOOTING)
3789 3798                          zone_status_set(z, ZONE_IS_RUNNING);
3790 3799                  mutex_exit(&zone_status_lock);
3791 3800                  /* cause the process to return to userland. */
3792 3801                  lwp_rtt();
3793 3802          }
3794 3803  }
3795 3804  
3796 3805  struct zsched_arg {             /* argument block read by zsched() through its void *arg */
3797 3806          zone_t *zone;           /* zone that the zsched process attaches itself to */
3798 3807          nvlist_t *nvlist;       /* rctl settings that zsched() applies to the zone */
3799 3808  };
3800 3809  
3801 3810  /*
3802 3811   * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3803 3812   * anything to do with scheduling, but rather with the fact that
3804 3813   * per-zone kernel threads are parented to zsched, just like regular
3805 3814   * kernel threads are parented to sched (p0).
3806 3815   *
3807 3816   * zsched is also responsible for launching init for the zone.
3808 3817   */
3809 3818  static void
3810 3819  zsched(void *arg)
3811 3820  {
3812 3821          struct zsched_arg *za = arg;
3813 3822          proc_t *pp = curproc;
3814 3823          proc_t *initp = proc_init;
3815 3824          zone_t *zone = za->zone;
3816 3825          cred_t *cr, *oldcred;
3817 3826          rctl_set_t *set;
3818 3827          rctl_alloc_gp_t *gp;
3819 3828          contract_t *ct = NULL;
3820 3829          task_t *tk, *oldtk;
3821 3830          rctl_entity_p_t e;
3822 3831          kproject_t *pj;
3823 3832  
3824 3833          nvlist_t *nvl = za->nvlist;
3825 3834          nvpair_t *nvp = NULL;
3826 3835  
                   /*
                    * Name the process "zsched".  sizeof ("zsched") counts the
                    * terminating NUL, so both copies end up NUL-terminated.
                    */
3827 3836          bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3828 3837          bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3829 3838          PTOU(pp)->u_argc = 0;
3830 3839          PTOU(pp)->u_argv = NULL;
3831 3840          PTOU(pp)->u_envp = NULL;
3832 3841          PTOU(pp)->u_commpagep = NULL;
3833 3842          closeall(P_FINFO(pp));
3834 3843  
3835 3844          /*
3836 3845           * We are this zone's "zsched" process.  As the zone isn't generally
3837 3846           * visible yet we don't need to grab any locks before initializing its
3838 3847           * zone_proc pointer.
3839 3848           */
3840 3849          zone_hold(zone);  /* this hold is released by zone_destroy() */
3841 3850          zone->zone_zsched = pp;
3842 3851          mutex_enter(&pp->p_lock);
3843 3852          pp->p_zone = zone;
3844 3853          mutex_exit(&pp->p_lock);
3845 3854  
3846 3855          /*
3847 3856           * Disassociate process from its 'parent'; parent ourselves to init
3848 3857           * (pid 1) and change other values as needed.
3849 3858           */
3850 3859          sess_create();
3851 3860  
3852 3861          mutex_enter(&pidlock);
3853 3862          proc_detach(pp);
3854 3863          pp->p_ppid = 1;
3855 3864          pp->p_flag |= SZONETOP;
3856 3865          pp->p_ancpid = 1;
                   /* Insert ourselves at the head of initp's list of children. */
3857 3866          pp->p_parent = initp;
3858 3867          pp->p_psibling = NULL;
3859 3868          if (initp->p_child)
3860 3869                  initp->p_child->p_psibling = pp;
3861 3870          pp->p_sibling = initp->p_child;
3862 3871          initp->p_child = pp;
3863 3872  
3864 3873          /* Decrement what newproc() incremented. */
3865 3874          upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3866 3875          /*
3867 3876           * Our credentials are about to become kcred-like, so we don't care
3868 3877           * about the caller's ruid.
3869 3878           */
3870 3879          upcount_inc(crgetruid(kcred), zone->zone_id);
3871 3880          mutex_exit(&pidlock);
3872 3881  
3873 3882          /*
3874 3883           * getting out of global zone, so decrement lwp and process counts
3875 3884           */
3876 3885          pj = pp->p_task->tk_proj;
3877 3886          mutex_enter(&global_zone->zone_nlwps_lock);
3878 3887          pj->kpj_nlwps -= pp->p_lwpcnt;
3879 3888          global_zone->zone_nlwps -= pp->p_lwpcnt;
3880 3889          pj->kpj_nprocs--;
3881 3890          global_zone->zone_nprocs--;
3882 3891          mutex_exit(&global_zone->zone_nlwps_lock);
3883 3892  
3884 3893          /*
3885 3894           * Decrement locked memory counts on old zone and project.
3886 3895           */
3887 3896          mutex_enter(&global_zone->zone_mem_lock);
3888 3897          global_zone->zone_locked_mem -= pp->p_locked_mem;
3889 3898          pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3890 3899          mutex_exit(&global_zone->zone_mem_lock);
3891 3900  
3892 3901          /*
3893 3902           * Create and join a new task in project '0' of this zone.
3894 3903           *
3895 3904           * We don't need to call holdlwps() since we know we're the only lwp in
3896 3905           * this process.
3897 3906           *
3898 3907           * task_join() returns with p_lock held.
3899 3908           */
3900 3909          tk = task_create(0, zone);
3901 3910          mutex_enter(&cpu_lock);
3902 3911          oldtk = task_join(tk, 0);
3903 3912  
                   /* Re-read the project: p_task now refers to the task just joined. */
3904 3913          pj = pp->p_task->tk_proj;
3905 3914  
3906 3915          mutex_enter(&zone->zone_mem_lock);
3907 3916          zone->zone_locked_mem += pp->p_locked_mem;
3908 3917          pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3909 3918          mutex_exit(&zone->zone_mem_lock);
3910 3919  
3911 3920          /*
3912 3921           * add lwp and process counts to zsched's zone, and increment
3913 3922           * project's task and process count due to the task created in
3914 3923           * the above task_create.
3915 3924           */
3916 3925          mutex_enter(&zone->zone_nlwps_lock);
3917 3926          pj->kpj_nlwps += pp->p_lwpcnt;
3918 3927          pj->kpj_ntasks += 1;
3919 3928          zone->zone_nlwps += pp->p_lwpcnt;
3920 3929          pj->kpj_nprocs++;
3921 3930          zone->zone_nprocs++;
3922 3931          mutex_exit(&zone->zone_nlwps_lock);
3923 3932  
                   /* Drop the p_lock that task_join() returned holding. */
3924 3933          mutex_exit(&curproc->p_lock);
3925 3934          mutex_exit(&cpu_lock);
3926 3935          task_rele(oldtk);
3927 3936  
3928 3937          /*
3929 3938           * The process was created by a process in the global zone, hence the
3930 3939           * credentials are wrong.  We might as well have kcred-ish credentials.
3931 3940           */
3932 3941          cr = zone->zone_kcred;
3933 3942          crhold(cr);
3934 3943          mutex_enter(&pp->p_crlock);
3935 3944          oldcred = pp->p_cred;
3936 3945          pp->p_cred = cr;
3937 3946          mutex_exit(&pp->p_crlock);
3938 3947          crfree(oldcred);
3939 3948  
3940 3949          /*
3941 3950           * Hold credentials again (for thread)
3942 3951           */
3943 3952          crhold(cr);
3944 3953  
3945 3954          /*
3946 3955           * p_lwpcnt can't change since this is a kernel process.
3947 3956           */
3948 3957          crset(pp, cr);
3949 3958  
3950 3959          /*
3951 3960           * Chroot
3952 3961           */
                   /* Both the current directory and the root become the zone's root vnode. */
3953 3962          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3954 3963          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3955 3964  
3956 3965          /*
3957 3966           * Initialize zone's rctl set.
3958 3967           */
3959 3968          set = rctl_set_create();
3960 3969          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3961 3970          mutex_enter(&pp->p_lock);
3962 3971          e.rcep_p.zone = zone;
3963 3972          e.rcep_t = RCENTITY_ZONE;
3964 3973          zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3965 3974          mutex_exit(&pp->p_lock);
3966 3975          rctl_prealloc_destroy(gp);
3967 3976  
3968 3977          /*
3969 3978           * Apply the rctls passed in to zone_create().  This is basically a list
3970 3979           * assignment: all of the old values are removed and the new ones
3971 3980           * inserted.  That is, if an empty list is passed in, all values are
3972 3981           * removed.
3973 3982           */
3974 3983          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3975 3984                  rctl_dict_entry_t *rde;
3976 3985                  rctl_hndl_t hndl;
3977 3986                  char *name;
3978 3987                  nvlist_t **nvlarray;
3979 3988                  uint_t i, nelem;
3980 3989                  int error;      /* For ASSERT()s */
3981 3990  
3982 3991                  name = nvpair_name(nvp);
3983 3992                  hndl = rctl_hndl_lookup(name);
3984 3993                  ASSERT(hndl != -1);
3985 3994                  rde = rctl_dict_lookup_hndl(hndl);
3986 3995                  ASSERT(rde != NULL);
3987 3996  
                           /*
                            * Delete the existing local values one at a time, stopping
                            * once the first remaining value is the RCPRIV_SYSTEM one.
                            */
3988 3997                  for (; /* ever */; ) {
3989 3998                          rctl_val_t oval;
3990 3999  
3991 4000                          mutex_enter(&pp->p_lock);
3992 4001                          error = rctl_local_get(hndl, NULL, &oval, pp);
3993 4002                          mutex_exit(&pp->p_lock);
3994 4003                          ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3995 4004                          ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3996 4005                          if (oval.rcv_privilege == RCPRIV_SYSTEM)
3997 4006                                  break;
3998 4007                          mutex_enter(&pp->p_lock);
3999 4008                          error = rctl_local_delete(hndl, &oval, pp);
4000 4009                          mutex_exit(&pp->p_lock);
4001 4010                          ASSERT(error == 0);
4002 4011                  }
4003 4012                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4004 4013                  ASSERT(error == 0);
4005 4014                  for (i = 0; i < nelem; i++) {
4006 4015                          rctl_val_t *nvalp;
4007 4016  
4008 4017                          nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4009 4018                          error = nvlist2rctlval(nvlarray[i], nvalp);
4010 4019                          ASSERT(error == 0);
4011 4020                          /*
4012 4021                           * rctl_local_insert can fail if the value being
4013 4022                           * inserted is a duplicate; this is OK.
4014 4023                           */
4015 4024                          mutex_enter(&pp->p_lock);
4016 4025                          if (rctl_local_insert(hndl, nvalp, pp) != 0)
4017 4026                                  kmem_cache_free(rctl_val_cache, nvalp);
4018 4027                          mutex_exit(&pp->p_lock);
4019 4028                  }
4020 4029          }
4021 4030  
4022 4031          /*
4023 4032           * Tell the world that we're done setting up.
4024 4033           *
4025 4034           * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4026 4035           * and atomically set the zone's processor set visibility.  Once
4027 4036           * we drop pool_lock() this zone will automatically get updated
4028 4037           * to reflect any future changes to the pools configuration.
4029 4038           *
4030 4039           * Note that after we drop the locks below (zonehash_lock in
4031 4040           * particular) other operations such as a zone_getattr call can
4032 4041           * now proceed and observe the zone. That is the reason for doing a
4033 4042           * state transition to the INITIALIZED state.
4034 4043           */
4035 4044          pool_lock();
4036 4045          mutex_enter(&cpu_lock);
4037 4046          mutex_enter(&zonehash_lock);
4038 4047          zone_uniqid(zone);
4039 4048          zone_zsd_configure(zone);
4040 4049          if (pool_state == POOL_ENABLED)
4041 4050                  zone_pset_set(zone, pool_default->pool_pset->pset_id);
4042 4051          mutex_enter(&zone_status_lock);
4043 4052          ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4044 4053          zone_status_set(zone, ZONE_IS_INITIALIZED);
4045 4054          mutex_exit(&zone_status_lock);
4046 4055          mutex_exit(&zonehash_lock);
4047 4056          mutex_exit(&cpu_lock);
4048 4057          pool_unlock();
4049 4058  
4050 4059          /* Now call the create callback for this key */
4051 4060          zsd_apply_all_keys(zsd_apply_create, zone);
4052 4061  
4053 4062          /* The callbacks are complete. Mark ZONE_IS_READY */
4054 4063          mutex_enter(&zone_status_lock);
4055 4064          ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4056 4065          zone_status_set(zone, ZONE_IS_READY);
4057 4066          mutex_exit(&zone_status_lock);
4058 4067  
4059 4068          /*
4060 4069           * Once we see the zone transition to the ZONE_IS_BOOTING state,
4061 4070           * we launch init, and set the state to running.
4062 4071           */
4063 4072          zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4064 4073  
                   /*
                    * init is launched only if the zone is exactly in the BOOTING
                    * state here; in any other state we go straight to the wait for
                    * ZONE_IS_DYING below.
                    */
4065 4074          if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4066 4075                  id_t cid;
4067 4076  
4068 4077                  /*
4069 4078                   * Ok, this is a little complicated.  We need to grab the
4070 4079                   * zone's pool's scheduling class ID; note that by now, we
4071 4080                   * are already bound to a pool if we need to be (zoneadmd
4072 4081                   * will have done that to us while we're in the READY
4073 4082                   * state).  *But* the scheduling class for the zone's 'init'
4074 4083                   * must be explicitly passed to newproc, which doesn't
4075 4084                   * respect pool bindings.
4076 4085                   *
4077 4086                   * We hold the pool_lock across the call to newproc() to
4078 4087                   * close the obvious race: the pool's scheduling class
4079 4088                   * could change before we manage to create the LWP with
4080 4089                   * classid 'cid'.
4081 4090                   */
4082 4091                  pool_lock();
                           /*
                            * Class precedence: the zone's own default class if set,
                            * else the pool's class, else the system-wide defaultcid.
                            */
4083 4092                  if (zone->zone_defaultcid > 0)
4084 4093                          cid = zone->zone_defaultcid;
4085 4094                  else
4086 4095                          cid = pool_get_class(zone->zone_pool);
4087 4096                  if (cid == -1)
4088 4097                          cid = defaultcid;
4089 4098  
4090 4099                  /*
4091 4100                   * If this fails, zone_boot will ultimately fail.  The
4092 4101                   * state of the zone will be set to SHUTTING_DOWN-- userland
4093 4102                   * will have to tear down the zone, and fail, or try again.
4094 4103                   */
4095 4104                  if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4096 4105                      minclsyspri - 1, &ct, 0)) != 0) {
4097 4106                          mutex_enter(&zone_status_lock);
4098 4107                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4099 4108                          mutex_exit(&zone_status_lock);
4100 4109                  } else {
4101 4110                          zone->zone_boot_time = gethrestime_sec();
4102 4111                  }
4103 4112  
4104 4113                  pool_unlock();
4105 4114          }
4106 4115  
4107 4116          /*
4108 4117           * Wait for zone_destroy() to be called.  This is what we spend
4109 4118           * most of our life doing.
4110 4119           */
4111 4120          zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4112 4121  
                   /*
                    * ct is still NULL unless the newproc() call above stored a
                    * contract through &ct.
                    */
4113 4122          if (ct)
4114 4123                  /*
4115 4124                   * At this point the process contract should be empty.
4116 4125                   * (Though if it isn't, it's not the end of the world.)
4117 4126                   */
4118 4127                  VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4119 4128  
4120 4129          /*
4121 4130           * Allow kcred to be freed when all referring processes
4122 4131           * (including this one) go away.  We can't just do this in
4123 4132           * zone_free because we need to wait for the zone_cred_ref to
4124 4133           * drop to 0 before calling zone_free, and the existence of
4125 4134           * zone_kcred will prevent that.  Thus, we call crfree here to
4126 4135           * balance the crdup in zone_create.  The crhold calls earlier
4127 4136           * in zsched will be dropped when the thread and process exit.
4128 4137           */
4129 4138          crfree(zone->zone_kcred);
4130 4139          zone->zone_kcred = NULL;
4131 4140  
4132 4141          exit(CLD_EXITED, 0);
4133 4142  }
4134 4143  
4135 4144  /*
4136 4145   * Helper function to determine if there are any submounts of the
4137 4146   * provided path.  Used to make sure the zone doesn't "inherit" any
4138 4147   * mounts from before it is created.
4139 4148   */
4140 4149  static uint_t
4141 4150  zone_mount_count(const char *rootpath)
4142 4151  {
4143 4152          vfs_t *vfsp;
4144 4153          uint_t count = 0;
4145 4154          size_t rootpathlen = strlen(rootpath);
4146 4155  
4147 4156          /*
4148 4157           * Holding zonehash_lock prevents race conditions with
4149 4158           * vfs_list_add()/vfs_list_remove() since we serialize with
4150 4159           * zone_find_by_path().
4151 4160           */
4152 4161          ASSERT(MUTEX_HELD(&zonehash_lock));
4153 4162          /*
4154 4163           * The rootpath must end with a '/'
4155 4164           */
4156 4165          ASSERT(rootpath[rootpathlen - 1] == '/');
4157 4166  
4158 4167          /*
4159 4168           * This intentionally does not count the rootpath itself if that
4160 4169           * happens to be a mount point.
4161 4170           */
4162 4171          vfs_list_read_lock();
4163 4172          vfsp = rootvfs;
4164 4173          do {
4165 4174                  if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4166 4175                      rootpathlen) == 0)
4167 4176                          count++;
4168 4177                  vfsp = vfsp->vfs_next;
4169 4178          } while (vfsp != rootvfs);
4170 4179          vfs_list_unlock();
4171 4180          return (count);
4172 4181  }
4173 4182  
4174 4183  /*
4175 4184   * Helper function to make sure that a zone created on 'rootpath'
4176 4185   * wouldn't end up containing other zones' rootpaths.
4177 4186   */
4178 4187  static boolean_t
4179 4188  zone_is_nested(const char *rootpath)
4180 4189  {
4181 4190          zone_t *zone;
4182 4191          size_t rootpathlen = strlen(rootpath);
4183 4192          size_t len;
4184 4193  
4185 4194          ASSERT(MUTEX_HELD(&zonehash_lock));
4186 4195  
4187 4196          /*
4188 4197           * zone_set_root() appended '/' and '\0' at the end of rootpath
4189 4198           */
4190 4199          if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4191 4200              (rootpath[1] == '/') && (rootpath[2] == '\0'))
4192 4201                  return (B_TRUE);
4193 4202  
4194 4203          for (zone = list_head(&zone_active); zone != NULL;
4195 4204              zone = list_next(&zone_active, zone)) {
4196 4205                  if (zone == global_zone)
4197 4206                          continue;
4198 4207                  len = strlen(zone->zone_rootpath);
4199 4208                  if (strncmp(rootpath, zone->zone_rootpath,
4200 4209                      MIN(rootpathlen, len)) == 0)
4201 4210                          return (B_TRUE);
4202 4211          }
4203 4212          return (B_FALSE);
4204 4213  }
4205 4214  
4206 4215  static int
4207 4216  zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4208 4217      size_t zone_privssz)
4209 4218  {
4210 4219          priv_set_t *privs;
4211 4220  
4212 4221          if (zone_privssz < sizeof (priv_set_t))
4213 4222                  return (ENOMEM);
4214 4223  
4215 4224          privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4216 4225  
4217 4226          if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4218 4227                  kmem_free(privs, sizeof (priv_set_t));
4219 4228                  return (EFAULT);
4220 4229          }
4221 4230  
4222 4231          zone->zone_privset = privs;
4223 4232          return (0);
4224 4233  }

↓ open down ↓

978 lines elided

↑ open up ↑

4225 4234  
4226 4235  /*
4227 4236   * We make creative use of nvlists to pass in rctls from userland.  The list is
4228 4237   * a list of the following structures:
4229 4238   *
4230 4239   * (name = rctl_name, value = nvpair_list_array)
4231 4240   *
4232 4241   * Where each element of the nvpair_list_array is of the form:
4233 4242   *
4234 4243   * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4235      - *      (name = "limit", value = uint64_t),
4236      - *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
     4244 + *      (name = "limit", value = uint64_t),
     4245 + *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4237 4246   */
4238 4247  static int
4239 4248  parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4240 4249  {
4241 4250          nvpair_t *nvp = NULL;
4242 4251          nvlist_t *nvl = NULL;
4243 4252          char *kbuf;
4244 4253          int error;
4245 4254          rctl_val_t rv;
4246 4255

4247 4256          *nvlp = NULL;
4248 4257  
4249 4258          if (buflen == 0)
4250 4259                  return (0);
4251 4260  
4252 4261          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4253 4262                  return (ENOMEM);
4254 4263          if (copyin(ubuf, kbuf, buflen)) {
4255 4264                  error = EFAULT;
4256 4265                  goto out;
4257 4266          }
4258 4267          if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4259 4268                  /*
4260 4269                   * nvl may have been allocated/free'd, but the value set to
4261 4270                   * non-NULL, so we reset it here.
4262 4271                   */
4263 4272                  nvl = NULL;
4264 4273                  error = EINVAL;
4265 4274                  goto out;
4266 4275          }
4267 4276          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4268 4277                  rctl_dict_entry_t *rde;
4269 4278                  rctl_hndl_t hndl;
4270 4279                  nvlist_t **nvlarray;
4271 4280                  uint_t i, nelem;
4272 4281                  char *name;
4273 4282  
4274 4283                  error = EINVAL;
4275 4284                  name = nvpair_name(nvp);
4276 4285                  if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4277 4286                      != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4278 4287                          goto out;
4279 4288                  }
4280 4289                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
4281 4290                          goto out;
4282 4291                  }
4283 4292                  rde = rctl_dict_lookup_hndl(hndl);
4284 4293                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4285 4294                  ASSERT(error == 0);
4286 4295                  for (i = 0; i < nelem; i++) {
4287 4296                          if (error = nvlist2rctlval(nvlarray[i], &rv))
4288 4297                                  goto out;
4289 4298                  }
4290 4299                  if (rctl_invalid_value(rde, &rv)) {
4291 4300                          error = EINVAL;
4292 4301                          goto out;
4293 4302                  }
4294 4303          }
4295 4304          error = 0;
4296 4305          *nvlp = nvl;
4297 4306  out:
4298 4307          kmem_free(kbuf, buflen);
4299 4308          if (error && nvl != NULL)
4300 4309                  nvlist_free(nvl);
4301 4310          return (error);
4302 4311  }
4303 4312  
4304 4313  int
4305 4314  zone_create_error(int er_error, int er_ext, int *er_out)
4306 4315  {
4307 4316          if (er_out != NULL) {
4308 4317                  if (copyout(&er_ext, er_out, sizeof (int))) {
4309 4318                          return (set_errno(EFAULT));
4310 4319                  }
4311 4320          }
4312 4321          return (set_errno(er_error));
4313 4322  }
4314 4323  
4315 4324  static int
4316 4325  zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4317 4326  {
4318 4327          ts_label_t *tsl;
4319 4328          bslabel_t blab;
4320 4329  
4321 4330          /* Get label from user */
4322 4331          if (copyin(lab, &blab, sizeof (blab)) != 0)
4323 4332                  return (EFAULT);
4324 4333          tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4325 4334          if (tsl == NULL)
4326 4335                  return (ENOMEM);
4327 4336  
4328 4337          zone->zone_slabel = tsl;
4329 4338          return (0);
4330 4339  }
4331 4340  
4332 4341  /*
4333 4342   * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4334 4343   */
4335 4344  static int
4336 4345  parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4337 4346  {
4338 4347          char *kbuf;
4339 4348          char *dataset, *next;
4340 4349          zone_dataset_t *zd;
4341 4350          size_t len;
4342 4351  
4343 4352          if (ubuf == NULL || buflen == 0)
4344 4353                  return (0);
4345 4354  
4346 4355          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4347 4356                  return (ENOMEM);
4348 4357  
4349 4358          if (copyin(ubuf, kbuf, buflen) != 0) {
4350 4359                  kmem_free(kbuf, buflen);
4351 4360                  return (EFAULT);
4352 4361          }
4353 4362  
4354 4363          dataset = next = kbuf;
4355 4364          for (;;) {
4356 4365                  zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4357 4366  
4358 4367                  next = strchr(dataset, ',');
4359 4368  
4360 4369                  if (next == NULL)
4361 4370                          len = strlen(dataset);
4362 4371                  else
4363 4372                          len = next - dataset;
4364 4373  
4365 4374                  zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4366 4375                  bcopy(dataset, zd->zd_dataset, len);
4367 4376                  zd->zd_dataset[len] = '\0';
4368 4377  
4369 4378                  list_insert_head(&zone->zone_datasets, zd);
4370 4379  
4371 4380                  if (next == NULL)
4372 4381                          break;
4373 4382  
4374 4383                  dataset = next + 1;
4375 4384          }
4376 4385  
4377 4386          kmem_free(kbuf, buflen);
4378 4387          return (0);
4379 4388  }
4380 4389  
4381 4390  /*
4382 4391   * System call to create/initialize a new zone named 'zone_name', rooted
4383 4392   * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4384 4393   * and initialized with the zone-wide rctls described in 'rctlbuf', and
4385 4394   * with labeling set by 'match', 'doi', and 'label'.
4386 4395   *
4387 4396   * If extended error is non-null, we may use it to return more detailed
4388 4397   * error information.
4389 4398   */
4390 4399  static zoneid_t
4391 4400  zone_create(const char *zone_name, const char *zone_root,
4392 4401      const priv_set_t *zone_privs, size_t zone_privssz,
4393 4402      caddr_t rctlbuf, size_t rctlbufsz,
4394 4403      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4395 4404      int match, uint32_t doi, const bslabel_t *label,
4396 4405      int flags)
4397 4406  {
4398 4407          struct zsched_arg zarg;
4399 4408          nvlist_t *rctls = NULL;
4400 4409          proc_t *pp = curproc;
4401 4410          zone_t *zone, *ztmp;
4402 4411          zoneid_t zoneid, start = GLOBAL_ZONEID;
4403 4412          int error;
4404 4413          int error2 = 0;
4405 4414          char *str;
4406 4415          cred_t *zkcr;
4407 4416          boolean_t insert_label_hash;
4408 4417  
4409 4418          if (secpolicy_zone_config(CRED()) != 0)
4410 4419                  return (set_errno(EPERM));
4411 4420  
4412 4421          /* can't boot zone from within chroot environment */
4413 4422          if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4414 4423                  return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4415 4424                      extended_error));
4416 4425          /*
4417 4426           * As the first step of zone creation, we want to allocate a zoneid.
4418 4427           * This allocation is complicated by the fact that netstacks use the
4419 4428           * zoneid to determine their stackid, but netstacks themselves are
4420 4429           * freed asynchronously with respect to zone destruction.  This means
4421 4430           * that a netstack reference leak (or in principle, an extraordinarily
4422 4431           * long netstack reference hold) could result in a zoneid being
4423 4432           * allocated that in fact corresponds to a stackid from an active
4424 4433           * (referenced) netstack -- unleashing all sorts of havoc when that
4425 4434           * netstack is actually (re)used.  (In the abstract, we might wish a
4426 4435           * zoneid to not be deallocated until its last referencing netstack
4427 4436           * has been released, but netstacks lack a backpointer into their
4428 4437           * referencing zone -- and changing them to have such a pointer would
4429 4438           * be substantial, to put it euphemistically.)  To avoid this, we
4430 4439           * detect this condition on allocation: if we have allocated a zoneid
4431 4440           * that corresponds to a netstack that's still in use, we warn about
4432 4441           * it (as it is much more likely to be a reference leak than an actual
4433 4442           * netstack reference), free it, and allocate another.  That these
4434 4443           * identifers are allocated out of an ID space assures that we won't
4435 4444           * see the identifier we just allocated.
4436 4445           */
4437 4446          for (;;) {
4438 4447                  zoneid = id_alloc(zoneid_space);
4439 4448  
4440 4449                  if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4441 4450                          break;
4442 4451  
4443 4452                  id_free(zoneid_space, zoneid);
4444 4453  
4445 4454                  if (start == GLOBAL_ZONEID) {
4446 4455                          start = zoneid;
4447 4456                  } else if (zoneid == start) {
4448 4457                          /*
4449 4458                           * We have managed to iterate over the entire available
4450 4459                           * zoneid space -- there are no identifiers available,
4451 4460                           * presumably due to some number of leaked netstack
4452 4461                           * references.  While it's in principle possible for us
4453 4462                           * to continue to try, it seems wiser to give up at
4454 4463                           * this point to warn and fail explicitly with a
4455 4464                           * distinctive error.
4456 4465                           */
4457 4466                          cmn_err(CE_WARN, "zone_create() failed: all available "
4458 4467                              "zone IDs have netstacks still in use");
4459 4468                          return (set_errno(ENFILE));
4460 4469                  }
4461 4470  
4462 4471                  cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4463 4472                      "netstack still in use", zoneid);
4464 4473          }
4465 4474  
4466 4475          zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4467 4476          zone->zone_id = zoneid;
4468 4477          zone->zone_status = ZONE_IS_UNINITIALIZED;
4469 4478          zone->zone_pool = pool_default;
4470 4479          zone->zone_pool_mod = gethrtime();
4471 4480          zone->zone_psetid = ZONE_PS_INVAL;
4472 4481          zone->zone_ncpus = 0;
4473 4482          zone->zone_ncpus_online = 0;
4474 4483          zone->zone_restart_init = B_TRUE;
4475 4484          zone->zone_brand = &native_brand;
4476 4485          zone->zone_initname = NULL;
4477 4486          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4478 4487          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4479 4488          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4480 4489          cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4481 4490          list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4482 4491              offsetof(zone_ref_t, zref_linkage));
4483 4492          list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4484 4493              offsetof(struct zsd_entry, zsd_linkage));
4485 4494          list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4486 4495              offsetof(zone_dataset_t, zd_linkage));
4487 4496          list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4488 4497              offsetof(zone_dl_t, zdl_linkage));
4489 4498          rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4490 4499          rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4491 4500  
4492 4501          if (flags & ZCF_NET_EXCL) {
4493 4502                  zone->zone_flags |= ZF_NET_EXCL;
4494 4503          }
4495 4504  
4496 4505          if ((error = zone_set_name(zone, zone_name)) != 0) {
4497 4506                  zone_free(zone);
4498 4507                  return (zone_create_error(error, 0, extended_error));
4499 4508          }
4500 4509  
4501 4510          if ((error = zone_set_root(zone, zone_root)) != 0) {
4502 4511                  zone_free(zone);
4503 4512                  return (zone_create_error(error, 0, extended_error));
4504 4513          }
4505 4514          if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4506 4515                  zone_free(zone);
4507 4516                  return (zone_create_error(error, 0, extended_error));
4508 4517          }
4509 4518  
4510 4519          /* initialize node name to be the same as zone name */
4511 4520          zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4512 4521          (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4513 4522          zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4514 4523  
4515 4524          zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);

↓ open down ↓

269 lines elided

↑ open up ↑

4516 4525          zone->zone_domain[0] = '\0';
4517 4526          zone->zone_hostid = HW_INVALID_HOSTID;
4518 4527          zone->zone_shares = 1;
4519 4528          zone->zone_shmmax = 0;
4520 4529          zone->zone_ipc.ipcq_shmmni = 0;
4521 4530          zone->zone_ipc.ipcq_semmni = 0;
4522 4531          zone->zone_ipc.ipcq_msgmni = 0;
4523 4532          zone->zone_bootargs = NULL;
4524 4533          zone->zone_fs_allowed = NULL;
4525 4534  
4526      -        secflags_zero(&zone0.zone_secflags.psf_lower);
4527      -        secflags_zero(&zone0.zone_secflags.psf_effective);
4528      -        secflags_zero(&zone0.zone_secflags.psf_inherit);
4529      -        secflags_fullset(&zone0.zone_secflags.psf_upper);
     4535 +        psecflags_default(&zone->zone_secflags);
4530 4536  
4531 4537          zone->zone_initname =
4532 4538              kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4533 4539          (void) strcpy(zone->zone_initname, zone_default_initname);
4534 4540          zone->zone_nlwps = 0;
4535 4541          zone->zone_nlwps_ctl = INT_MAX;
4536 4542          zone->zone_nprocs = 0;
4537 4543          zone->zone_nprocs_ctl = INT_MAX;
4538 4544          zone->zone_locked_mem = 0;
4539 4545          zone->zone_locked_mem_ctl = UINT64_MAX;
4540 4546          zone->zone_max_swap = 0;
4541 4547          zone->zone_max_swap_ctl = UINT64_MAX;
4542 4548          zone->zone_max_lofi = 0;
4543 4549          zone->zone_max_lofi_ctl = UINT64_MAX;
4544 4550          zone0.zone_lockedmem_kstat = NULL;
4545 4551          zone0.zone_swapresv_kstat = NULL;
4546 4552  
     4553 +        zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
     4554 +
4547 4555          /*
4548 4556           * Zsched initializes the rctls.
4549 4557           */
4550 4558          zone->zone_rctls = NULL;
4551 4559  
4552 4560          if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4553 4561                  zone_free(zone);
4554 4562                  return (zone_create_error(error, 0, extended_error));
4555 4563          }
4556 4564

4557 4565          if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4558 4566                  zone_free(zone);
4559 4567                  return (set_errno(error));
4560 4568          }
4561 4569  
4562 4570          /*
4563 4571           * Read in the trusted system parameters:
4564 4572           * match flag and sensitivity label.
4565 4573           */
4566 4574          zone->zone_match = match;
4567 4575          if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4568 4576                  /* Fail if requested to set doi to anything but system's doi */
4569 4577                  if (doi != 0 && doi != default_doi) {
4570 4578                          zone_free(zone);
4571 4579                          return (set_errno(EINVAL));
4572 4580                  }
4573 4581                  /* Always apply system's doi to the zone */
4574 4582                  error = zone_set_label(zone, label, default_doi);
4575 4583                  if (error != 0) {
4576 4584                          zone_free(zone);
4577 4585                          return (set_errno(error));
4578 4586                  }
4579 4587                  insert_label_hash = B_TRUE;
4580 4588          } else {
4581 4589                  /* all zones get an admin_low label if system is not labeled */
4582 4590                  zone->zone_slabel = l_admin_low;
4583 4591                  label_hold(l_admin_low);
4584 4592                  insert_label_hash = B_FALSE;
4585 4593          }
4586 4594  
4587 4595          /*
4588 4596           * Stop all lwps since that's what normally happens as part of fork().
4589 4597           * This needs to happen before we grab any locks to avoid deadlock
4590 4598           * (another lwp in the process could be waiting for the held lock).
4591 4599           */
4592 4600          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4593 4601                  zone_free(zone);
4594 4602                  nvlist_free(rctls);
4595 4603                  return (zone_create_error(error, 0, extended_error));
4596 4604          }
4597 4605  
4598 4606          if (block_mounts(zone) == 0) {
4599 4607                  mutex_enter(&pp->p_lock);
4600 4608                  if (curthread != pp->p_agenttp)
4601 4609                          continuelwps(pp);
4602 4610                  mutex_exit(&pp->p_lock);
4603 4611                  zone_free(zone);
4604 4612                  nvlist_free(rctls);
4605 4613                  return (zone_create_error(error, 0, extended_error));
4606 4614          }
4607 4615  
4608 4616          /*
4609 4617           * Set up credential for kernel access.  After this, any errors
4610 4618           * should go through the dance in errout rather than calling
4611 4619           * zone_free directly.
4612 4620           */
4613 4621          zone->zone_kcred = crdup(kcred);
4614 4622          crsetzone(zone->zone_kcred, zone);
4615 4623          priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4616 4624          priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4617 4625          priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4618 4626          priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4619 4627  
4620 4628          mutex_enter(&zonehash_lock);
4621 4629          /*
4622 4630           * Make sure zone doesn't already exist.
4623 4631           *
4624 4632           * If the system and zone are labeled,
4625 4633           * make sure no other zone exists that has the same label.
4626 4634           */
4627 4635          if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4628 4636              (insert_label_hash &&
4629 4637              (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4630 4638                  zone_status_t status;
4631 4639  
4632 4640                  status = zone_status_get(ztmp);
4633 4641                  if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4634 4642                          error = EEXIST;
4635 4643                  else
4636 4644                          error = EBUSY;
4637 4645  
4638 4646                  if (insert_label_hash)
4639 4647                          error2 = ZE_LABELINUSE;
4640 4648  
4641 4649                  goto errout;
4642 4650          }
4643 4651  
4644 4652          /*
4645 4653           * Don't allow zone creations which would cause one zone's rootpath to
4646 4654           * be accessible from that of another (non-global) zone.
4647 4655           */
4648 4656          if (zone_is_nested(zone->zone_rootpath)) {
4649 4657                  error = EBUSY;
4650 4658                  goto errout;
4651 4659          }
4652 4660  
4653 4661          ASSERT(zonecount != 0);         /* check for leaks */
4654 4662          if (zonecount + 1 > maxzones) {
4655 4663                  error = ENOMEM;
4656 4664                  goto errout;
4657 4665          }
4658 4666  
4659 4667          if (zone_mount_count(zone->zone_rootpath) != 0) {
4660 4668                  error = EBUSY;
4661 4669                  error2 = ZE_AREMOUNTS;
4662 4670                  goto errout;
4663 4671          }
4664 4672  
4665 4673          /*
4666 4674           * Zone is still incomplete, but we need to drop all locks while
4667 4675           * zsched() initializes this zone's kernel process.  We
4668 4676           * optimistically add the zone to the hashtable and associated
4669 4677           * lists so a parallel zone_create() doesn't try to create the
4670 4678           * same zone.
4671 4679           */
4672 4680          zonecount++;
4673 4681          (void) mod_hash_insert(zonehashbyid,
4674 4682              (mod_hash_key_t)(uintptr_t)zone->zone_id,
4675 4683              (mod_hash_val_t)(uintptr_t)zone);
4676 4684          str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4677 4685          (void) strcpy(str, zone->zone_name);
4678 4686          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4679 4687              (mod_hash_val_t)(uintptr_t)zone);
4680 4688          if (insert_label_hash) {
4681 4689                  (void) mod_hash_insert(zonehashbylabel,
4682 4690                      (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4683 4691                  zone->zone_flags |= ZF_HASHED_LABEL;
4684 4692          }
4685 4693  
4686 4694          /*
4687 4695           * Insert into active list.  At this point there are no 'hold's
4688 4696           * on the zone, but everyone else knows not to use it, so we can
4689 4697           * continue to use it.  zsched() will do a zone_hold() if the
4690 4698           * newproc() is successful.
4691 4699           */
4692 4700          list_insert_tail(&zone_active, zone);
4693 4701          mutex_exit(&zonehash_lock);
4694 4702  
4695 4703          zarg.zone = zone;
4696 4704          zarg.nvlist = rctls;
4697 4705          /*
4698 4706           * The process, task, and project rctls are probably wrong;
4699 4707           * we need an interface to get the default values of all rctls,
4700 4708           * and initialize zsched appropriately.  I'm not sure that that
4701 4709           * makes much of a difference, though.
4702 4710           */
4703 4711          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4704 4712          if (error != 0) {
4705 4713                  /*
4706 4714                   * We need to undo all globally visible state.
4707 4715                   */
4708 4716                  mutex_enter(&zonehash_lock);
4709 4717                  list_remove(&zone_active, zone);
4710 4718                  if (zone->zone_flags & ZF_HASHED_LABEL) {
4711 4719                          ASSERT(zone->zone_slabel != NULL);
4712 4720                          (void) mod_hash_destroy(zonehashbylabel,
4713 4721                              (mod_hash_key_t)zone->zone_slabel);
4714 4722                  }
4715 4723                  (void) mod_hash_destroy(zonehashbyname,
4716 4724                      (mod_hash_key_t)(uintptr_t)zone->zone_name);
4717 4725                  (void) mod_hash_destroy(zonehashbyid,
4718 4726                      (mod_hash_key_t)(uintptr_t)zone->zone_id);
4719 4727                  ASSERT(zonecount > 1);
4720 4728                  zonecount--;
4721 4729                  goto errout;
4722 4730          }
4723 4731  
4724 4732          /*
4725 4733           * Zone creation can't fail from now on.
4726 4734           */
4727 4735  
4728 4736          /*
4729 4737           * Create zone kstats
4730 4738           */
4731 4739          zone_kstat_create(zone);
4732 4740  
4733 4741          /*
4734 4742           * Let the other lwps continue.
4735 4743           */
4736 4744          mutex_enter(&pp->p_lock);
4737 4745          if (curthread != pp->p_agenttp)
4738 4746                  continuelwps(pp);
4739 4747          mutex_exit(&pp->p_lock);
4740 4748  
4741 4749          /*
4742 4750           * Wait for zsched to finish initializing the zone.
4743 4751           */
4744 4752          zone_status_wait(zone, ZONE_IS_READY);
4745 4753          /*
4746 4754           * The zone is fully visible, so we can let mounts progress.
4747 4755           */
4748 4756          resume_mounts(zone);
4749 4757          nvlist_free(rctls);
4750 4758  
4751 4759          return (zoneid);
4752 4760  
4753 4761  errout:
4754 4762          mutex_exit(&zonehash_lock);
4755 4763          /*
4756 4764           * Let the other lwps continue.
4757 4765           */
4758 4766          mutex_enter(&pp->p_lock);
4759 4767          if (curthread != pp->p_agenttp)
4760 4768                  continuelwps(pp);
4761 4769          mutex_exit(&pp->p_lock);
4762 4770  
4763 4771          resume_mounts(zone);
4764 4772          nvlist_free(rctls);
4765 4773          /*
4766 4774           * There is currently one reference to the zone, a cred_ref from
4767 4775           * zone_kcred.  To free the zone, we call crfree, which will call
4768 4776           * zone_cred_rele, which will call zone_free.
4769 4777           */
4770 4778          ASSERT(zone->zone_cred_ref == 1);
4771 4779          ASSERT(zone->zone_kcred->cr_ref == 1);
4772 4780          ASSERT(zone->zone_ref == 0);
4773 4781          zkcr = zone->zone_kcred;
4774 4782          zone->zone_kcred = NULL;
4775 4783          crfree(zkcr);                           /* triggers call to zone_free */
4776 4784          return (zone_create_error(error, error2, extended_error));
4777 4785  }
4778 4786  
4779 4787  /*
4780 4788   * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4781 4789   * the heavy lifting.  initname is the path to the program to launch
4782 4790   * at the "top" of the zone; if this is NULL, we use the system default,
4783 4791   * which is stored at zone_default_initname.
4784 4792   */
4785 4793  static int
4786 4794  zone_boot(zoneid_t zoneid)
4787 4795  {
4788 4796          int err;
4789 4797          zone_t *zone;
4790 4798  
4791 4799          if (secpolicy_zone_config(CRED()) != 0)
4792 4800                  return (set_errno(EPERM));
4793 4801          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4794 4802                  return (set_errno(EINVAL));
4795 4803  
4796 4804          mutex_enter(&zonehash_lock);
4797 4805          /*
4798 4806           * Look for zone under hash lock to prevent races with calls to
4799 4807           * zone_shutdown, zone_destroy, etc.
4800 4808           */
4801 4809          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4802 4810                  mutex_exit(&zonehash_lock);
4803 4811                  return (set_errno(EINVAL));
4804 4812          }
4805 4813  
4806 4814          mutex_enter(&zone_status_lock);
4807 4815          if (zone_status_get(zone) != ZONE_IS_READY) {
4808 4816                  mutex_exit(&zone_status_lock);
4809 4817                  mutex_exit(&zonehash_lock);
4810 4818                  return (set_errno(EINVAL));
4811 4819          }
4812 4820          zone_status_set(zone, ZONE_IS_BOOTING);
4813 4821          mutex_exit(&zone_status_lock);
4814 4822  
4815 4823          zone_hold(zone);        /* so we can use the zone_t later */
4816 4824          mutex_exit(&zonehash_lock);
4817 4825  
4818 4826          if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4819 4827                  zone_rele(zone);
4820 4828                  return (set_errno(EINTR));
4821 4829          }
4822 4830  
4823 4831          /*
4824 4832           * Boot (starting init) might have failed, in which case the zone
4825 4833           * will go to the SHUTTING_DOWN state; an appropriate errno will
4826 4834           * be placed in zone->zone_boot_err, and so we return that.
4827 4835           */
4828 4836          err = zone->zone_boot_err;
4829 4837          zone_rele(zone);
4830 4838          return (err ? set_errno(err) : 0);
4831 4839  }
4832 4840  
4833 4841  /*
4834 4842   * Kills all user processes in the zone, waiting for them all to exit
4835 4843   * before returning.
4836 4844   */
4837 4845  static int
4838 4846  zone_empty(zone_t *zone)
4839 4847  {
4840 4848          int waitstatus;
4841 4849  
4842 4850          /*
4843 4851           * We need to drop zonehash_lock before killing all
4844 4852           * processes, otherwise we'll deadlock with zone_find_*
4845 4853           * which can be called from the exit path.
4846 4854           */
4847 4855          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4848 4856          while ((waitstatus = zone_status_timedwait_sig(zone,
4849 4857              ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4850 4858                  killall(zone->zone_id);
4851 4859          }
4852 4860          /*
4853 4861           * return EINTR if we were signaled
4854 4862           */
4855 4863          if (waitstatus == 0)
4856 4864                  return (EINTR);
4857 4865          return (0);
4858 4866  }
4859 4867  
4860 4868  /*
4861 4869   * This function implements the policy for zone visibility.
4862 4870   *
4863 4871   * In standard Solaris, a non-global zone can only see itself.
4864 4872   *
4865 4873   * In Trusted Extensions, a labeled zone can lookup any zone whose label
4866 4874   * it dominates. For this test, the label of the global zone is treated as
4867 4875   * admin_high so it is special-cased instead of being checked for dominance.
4868 4876   *
4869 4877   * Returns true if zone attributes are viewable, false otherwise.
4870 4878   */
4871 4879  static boolean_t
4872 4880  zone_list_access(zone_t *zone)
4873 4881  {
4874 4882  
4875 4883          if (curproc->p_zone == global_zone ||
4876 4884              curproc->p_zone == zone) {
4877 4885                  return (B_TRUE);
4878 4886          } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4879 4887                  bslabel_t *curproc_label;
4880 4888                  bslabel_t *zone_label;
4881 4889  
4882 4890                  curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4883 4891                  zone_label = label2bslabel(zone->zone_slabel);
4884 4892  
4885 4893                  if (zone->zone_id != GLOBAL_ZONEID &&
4886 4894                      bldominates(curproc_label, zone_label)) {
4887 4895                          return (B_TRUE);
4888 4896                  } else {
4889 4897                          return (B_FALSE);
4890 4898                  }
4891 4899          } else {
4892 4900                  return (B_FALSE);
4893 4901          }
4894 4902  }
4895 4903  
4896 4904  /*
4897 4905   * Systemcall to start the zone's halt sequence.  By the time this
4898 4906   * function successfully returns, all user processes and kernel threads
4899 4907   * executing in it will have exited, ZSD shutdown callbacks executed,
4900 4908   * and the zone status set to ZONE_IS_DOWN.
4901 4909   *
4902 4910   * It is possible that the call will interrupt itself if the caller is the
4903 4911   * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4904 4912   */
4905 4913  static int
4906 4914  zone_shutdown(zoneid_t zoneid)
4907 4915  {
4908 4916          int error;
4909 4917          zone_t *zone;
4910 4918          zone_status_t status;
4911 4919  
4912 4920          if (secpolicy_zone_config(CRED()) != 0)
4913 4921                  return (set_errno(EPERM));
4914 4922          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4915 4923                  return (set_errno(EINVAL));
4916 4924  
4917 4925          mutex_enter(&zonehash_lock);
4918 4926          /*
4919 4927           * Look for zone under hash lock to prevent races with other
4920 4928           * calls to zone_shutdown and zone_destroy.
4921 4929           */
4922 4930          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4923 4931                  mutex_exit(&zonehash_lock);
4924 4932                  return (set_errno(EINVAL));
4925 4933          }
4926 4934  
4927 4935          /*
4928 4936           * We have to drop zonehash_lock before calling block_mounts.
4929 4937           * Hold the zone so we can continue to use the zone_t.
4930 4938           */
4931 4939          zone_hold(zone);
4932 4940          mutex_exit(&zonehash_lock);
4933 4941  
4934 4942          /*
4935 4943           * Block mounts so that VFS_MOUNT() can get an accurate view of
4936 4944           * the zone's status with regards to ZONE_IS_SHUTTING down.
4937 4945           *
4938 4946           * e.g. NFS can fail the mount if it determines that the zone
4939 4947           * has already begun the shutdown sequence.
4940 4948           *
4941 4949           */
4942 4950          if (block_mounts(zone) == 0) {
4943 4951                  zone_rele(zone);
4944 4952                  return (set_errno(EINTR));
4945 4953          }
4946 4954  
4947 4955          mutex_enter(&zonehash_lock);
4948 4956          mutex_enter(&zone_status_lock);
4949 4957          status = zone_status_get(zone);
4950 4958          /*
4951 4959           * Fail if the zone isn't fully initialized yet.
4952 4960           */
4953 4961          if (status < ZONE_IS_READY) {
4954 4962                  mutex_exit(&zone_status_lock);
4955 4963                  mutex_exit(&zonehash_lock);
4956 4964                  resume_mounts(zone);
4957 4965                  zone_rele(zone);
4958 4966                  return (set_errno(EINVAL));
4959 4967          }
4960 4968          /*
4961 4969           * If conditions required for zone_shutdown() to return have been met,
4962 4970           * return success.
4963 4971           */
4964 4972          if (status >= ZONE_IS_DOWN) {
4965 4973                  mutex_exit(&zone_status_lock);
4966 4974                  mutex_exit(&zonehash_lock);
4967 4975                  resume_mounts(zone);
4968 4976                  zone_rele(zone);
4969 4977                  return (0);
4970 4978          }
4971 4979          /*
4972 4980           * If zone_shutdown() hasn't been called before, go through the motions.
4973 4981           * If it has, there's nothing to do but wait for the kernel threads to
4974 4982           * drain.
4975 4983           */
4976 4984          if (status < ZONE_IS_EMPTY) {
4977 4985                  uint_t ntasks;
4978 4986  
4979 4987                  mutex_enter(&zone->zone_lock);
4980 4988                  if ((ntasks = zone->zone_ntasks) != 1) {
4981 4989                          /*
4982 4990                           * There's still stuff running.
4983 4991                           */
4984 4992                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4985 4993                  }
4986 4994                  mutex_exit(&zone->zone_lock);
4987 4995                  if (ntasks == 1) {
4988 4996                          /*
4989 4997                           * The only way to create another task is through
4990 4998                           * zone_enter(), which will block until we drop
4991 4999                           * zonehash_lock.  The zone is empty.
4992 5000                           */
4993 5001                          if (zone->zone_kthreads == NULL) {
4994 5002                                  /*
4995 5003                                   * Skip ahead to ZONE_IS_DOWN
4996 5004                                   */
4997 5005                                  zone_status_set(zone, ZONE_IS_DOWN);
4998 5006                          } else {
4999 5007                                  zone_status_set(zone, ZONE_IS_EMPTY);
5000 5008                          }
5001 5009                  }
5002 5010          }
5003 5011          mutex_exit(&zone_status_lock);
5004 5012          mutex_exit(&zonehash_lock);
5005 5013          resume_mounts(zone);
5006 5014  
5007 5015          if (error = zone_empty(zone)) {
5008 5016                  zone_rele(zone);
5009 5017                  return (set_errno(error));
5010 5018          }
5011 5019          /*
5012 5020           * After the zone status goes to ZONE_IS_DOWN this zone will no
5013 5021           * longer be notified of changes to the pools configuration, so
5014 5022           * in order to not end up with a stale pool pointer, we point
5015 5023           * ourselves at the default pool and remove all resource
5016 5024           * visibility.  This is especially important as the zone_t may
5017 5025           * languish on the deathrow for a very long time waiting for
5018 5026           * cred's to drain out.
5019 5027           *
5020 5028           * This rebinding of the zone can happen multiple times
5021 5029           * (presumably due to interrupted or parallel systemcalls)
5022 5030           * without any adverse effects.
5023 5031           */
5024 5032          if (pool_lock_intr() != 0) {
5025 5033                  zone_rele(zone);
5026 5034                  return (set_errno(EINTR));
5027 5035          }
5028 5036          if (pool_state == POOL_ENABLED) {
5029 5037                  mutex_enter(&cpu_lock);
5030 5038                  zone_pool_set(zone, pool_default);
5031 5039                  /*
5032 5040                   * The zone no longer needs to be able to see any cpus.
5033 5041                   */
5034 5042                  zone_pset_set(zone, ZONE_PS_INVAL);
5035 5043                  mutex_exit(&cpu_lock);
5036 5044          }
5037 5045          pool_unlock();
5038 5046  
5039 5047          /*
5040 5048           * ZSD shutdown callbacks can be executed multiple times, hence
5041 5049           * it is safe to not be holding any locks across this call.
5042 5050           */
5043 5051          zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5044 5052  
5045 5053          mutex_enter(&zone_status_lock);
5046 5054          if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5047 5055                  zone_status_set(zone, ZONE_IS_DOWN);
5048 5056          mutex_exit(&zone_status_lock);
5049 5057  
5050 5058          /*
5051 5059           * Wait for kernel threads to drain.
5052 5060           */
5053 5061          if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5054 5062                  zone_rele(zone);
5055 5063                  return (set_errno(EINTR));
5056 5064          }
5057 5065  
5058 5066          /*
5059 5067           * Zone can become down/destroyable even if the above wait
5060 5068           * returns EINTR, so any code added here may never execute.
5061 5069           * (i.e. don't add code here)
5062 5070           */
5063 5071  
5064 5072          zone_rele(zone);
5065 5073          return (0);
5066 5074  }
5067 5075  
5068 5076  /*
5069 5077   * Log the specified zone's reference counts.  The caller should not be
5070 5078   * holding the zone's zone_lock.
            *
            * This function acquires zone_lock itself to snapshot the counters.
            * The general-purpose (zone_ref) and credential (zone_cred_ref) counts
            * are always included in the message; any nonzero entries of
            * zone_subsys_ref[] are appended as a bracketed "name: count" list.
            * The message is emitted with strlog() using SL_CONSOLE | SL_NOTE.
            *
            * Side effect: ZF_REFCOUNTS_LOGGED is set in zone->zone_flags.
5071 5079   */
5072 5080  static void
5073 5081  zone_log_refcounts(zone_t *zone)
5074 5082  {
5075 5083          char *buffer;
5076 5084          char *buffer_position;
5077 5085          uint32_t buffer_size;
5078 5086          uint32_t index;
5079 5087          uint_t ref;
5080 5088          uint_t cred_ref;
5081 5089  
5082 5090          /*
5083 5091           * Construct a string representing the subsystem-specific reference
5084 5092           * counts.  The counts are printed in ascending order by index into the
5085 5093           * zone_t::zone_subsys_ref array.  The list will be surrounded by
5086 5094           * square brackets [] and will only contain nonzero reference counts.
5087 5095           *
5088 5096           * The buffer will hold two square bracket characters plus ten digits,
5089 5097           * one colon, one space, one comma, and some characters for a
5090 5098           * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5091 5099           * bit integers have at most ten decimal digits.)  The last
5092 5100           * reference count's comma is replaced by the closing square
5093 5101           * bracket and a NUL character to terminate the string.
5094 5102           *
5095 5103           * NOTE: We have to grab the zone's zone_lock to create a consistent
5096 5104           * snapshot of the zone's reference counters.
5097 5105           *
5098 5106           * First, figure out how much space the string buffer will need.
5099 5107           * The buffer's size is stored in buffer_size.
                    *
                    * The constant 13 used below is the per-entry overhead described
                    * above: ten digits + ':' + ' ' + ','.
5100 5108           */
5101 5109          buffer_size = 2;                        /* for the square brackets */
5102 5110          mutex_enter(&zone->zone_lock);
5103 5111          zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5104 5112          ref = zone->zone_ref;
5105 5113          cred_ref = zone->zone_cred_ref;
5106 5114          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5107 5115                  if (zone->zone_subsys_ref[index] != 0)
5108 5116                          buffer_size += strlen(zone_ref_subsys_names[index]) +
5109 5117                              13;
5110 5118          if (buffer_size == 2) {
5111 5119                  /*
5112 5120                   * No subsystems had nonzero reference counts.  Don't bother
5113 5121                   * with allocating a buffer; just log the general-purpose and
5114 5122                   * credential reference counts.
5115 5123                   */
5116 5124                  mutex_exit(&zone->zone_lock);
5117 5125                  (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5118 5126                      "Zone '%s' (ID: %d) is shutting down, but %u zone "
5119 5127                      "references and %u credential references are still extant",
5120 5128                      zone->zone_name, zone->zone_id, ref, cred_ref);
5121 5129                  return;
5122 5130          }
5123 5131  
5124 5132          /*
5125 5133           * buffer_size contains the exact number of characters that the
5126 5134           * buffer will need.  Allocate the buffer and fill it with nonzero
5127 5135           * subsystem-specific reference counts.  Surround the results with
5128 5136           * square brackets afterwards.
                    *
                    * zone_lock is still held here (it is dropped only after the
                    * fill loop below), so the counters that sized the buffer are
                    * the same ones that get formatted into it.  Writing starts at
                    * buffer[1]; buffer[0] receives the opening '[' after the loop.
5129 5137           */
5130 5138          buffer = kmem_alloc(buffer_size, KM_SLEEP);
5131 5139          buffer_position = &buffer[1];
5132 5140          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5133 5141                  /*
5134 5142                   * NOTE: The DDI's version of sprintf() returns a pointer to
5135 5143                   * the modified buffer rather than the number of bytes written
5136 5144                   * (as in snprintf(3C)).  This is unfortunate and annoying.
5137 5145                   * Therefore, we'll use snprintf() with INT_MAX to get the
5138 5146                   * number of bytes written.  Using INT_MAX is safe because
5139 5147                   * the buffer is perfectly sized for the data: we'll never
5140 5148                   * overrun the buffer.
5141 5149                   */
5142 5150                  if (zone->zone_subsys_ref[index] != 0)
5143 5151                          buffer_position += snprintf(buffer_position, INT_MAX,
5144 5152                              "%s: %u,", zone_ref_subsys_names[index],
5145 5153                              zone->zone_subsys_ref[index]);
5146 5154          }
5147 5155          mutex_exit(&zone->zone_lock);
5148 5156          buffer[0] = '[';
                   /*
                    * buffer_position now points at the terminating NUL written by the
                    * last snprintf(); the character before it is that entry's trailing
                    * comma, which becomes the closing bracket.
                    */
5149 5157          ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5150 5158          ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5151 5159          buffer_position[-1] = ']';
5152 5160  
5153 5161          /*
5154 5162           * Log the reference counts and free the message buffer.
5155 5163           */
5156 5164          (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5157 5165              "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5158 5166              "%u credential references are still extant %s", zone->zone_name,
5159 5167              zone->zone_id, ref, cred_ref, buffer);
5160 5168          kmem_free(buffer, buffer_size);
5161 5169  }
5162 5170  
5163 5171  /*
5164 5172   * Systemcall entry point to finalize the zone halt process.  The caller
5165 5173   * must have already successfully called zone_shutdown().
5166 5174   *
5167 5175   * Upon successful completion, the zone will have been fully destroyed:
5168 5176   * zsched will have exited, destructor callbacks executed, and the zone
5169 5177   * removed from the list of active zones.
            *
            * Failures are reported through set_errno():
            *   EPERM  - secpolicy_zone_config() rejects the caller's credentials.
            *   EINVAL - zoneid is outside [MIN_USERZONEID, MAX_ZONEID], or no zone
            *            with that id is found.
            *   EBUSY  - zone_mount_count() is nonzero for the zone's root path, or
            *            the zone's status is still below ZONE_IS_DOWN.
            *   EINTR  - a signal interrupted the wait for the zone's references
            *            to drain.
            *
            * Returns 0 after the zone has been torn down by this thread, and also
            * when the zone is found to have gone away while this thread was
            * waiting.
5170 5178   */
5171 5179  static int
5172 5180  zone_destroy(zoneid_t zoneid)
5173 5181  {
5174 5182          uint64_t uniqid;
5175 5183          zone_t *zone;
5176 5184          zone_status_t status;
5177 5185          clock_t wait_time;
5178 5186          boolean_t log_refcounts;
5179 5187  
5180 5188          if (secpolicy_zone_config(CRED()) != 0)
5181 5189                  return (set_errno(EPERM));
5182 5190          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5183 5191                  return (set_errno(EINVAL));
5184 5192  
5185 5193          mutex_enter(&zonehash_lock);
5186 5194          /*
5187 5195           * Look for zone under hash lock to prevent races with other
5188 5196           * calls to zone_destroy.
5189 5197           */
5190 5198          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5191 5199                  mutex_exit(&zonehash_lock);
5192 5200                  return (set_errno(EINVAL));
5193 5201          }
5194 5202  
5195 5203          if (zone_mount_count(zone->zone_rootpath) != 0) {
5196 5204                  mutex_exit(&zonehash_lock);
5197 5205                  return (set_errno(EBUSY));
5198 5206          }
5199 5207          mutex_enter(&zone_status_lock);
5200 5208          status = zone_status_get(zone);
5201 5209          if (status < ZONE_IS_DOWN) {
5202 5210                  mutex_exit(&zone_status_lock);
5203 5211                  mutex_exit(&zonehash_lock);
5204 5212                  return (set_errno(EBUSY));
5205 5213          } else if (status == ZONE_IS_DOWN) {
5206 5214                  zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5207 5215          }
5208 5216          mutex_exit(&zone_status_lock);
                   /*
                    * Take a hold before dropping zonehash_lock; it is released below,
                    * after the ZSD destroy callbacks have run.
                    */
5209 5217          zone_hold(zone);
5210 5218          mutex_exit(&zonehash_lock);
5211 5219  
5212 5220          /*
5213 5221           * wait for zsched to exit
5214 5222           */
5215 5223          zone_status_wait(zone, ZONE_IS_DEAD);
5216 5224          zone_zsd_callbacks(zone, ZSD_DESTROY);
5217 5225          zone->zone_netstack = NULL;
                   /*
                    * Remember the zone's unique id: once the hold is dropped, the
                    * loop below looks the zone up by zoneid again and uses uniqid to
                    * detect that the id no longer refers to this same zone.
                    */
5218 5226          uniqid = zone->zone_uniqid;
5219 5227          zone_rele(zone);
5220 5228          zone = NULL;    /* potentially free'd */
5221 5229  
5222 5230          log_refcounts = B_FALSE;
5223 5231          wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5224 5232          mutex_enter(&zonehash_lock);
5225 5233          for (; /* ever */; ) {
5226 5234                  boolean_t unref;
5227 5235                  boolean_t refs_have_been_logged;
5228 5236  
5229 5237                  if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5230 5238                      zone->zone_uniqid != uniqid) {
5231 5239                          /*
5232 5240                           * The zone has gone away.  Necessary conditions
5233 5241                           * are met, so we return success.
5234 5242                           */
5235 5243                          mutex_exit(&zonehash_lock);
5236 5244                          return (0);
5237 5245                  }
5238 5246                  mutex_enter(&zone->zone_lock);
5239 5247                  unref = ZONE_IS_UNREF(zone);
5240 5248                  refs_have_been_logged = (zone->zone_flags &
5241 5249                      ZF_REFCOUNTS_LOGGED);
5242 5250                  mutex_exit(&zone->zone_lock);
5243 5251                  if (unref) {
5244 5252                          /*
5245 5253                           * There is only one reference to the zone -- that
5246 5254                           * added when the zone was added to the hashtables --
5247 5255                           * and things will remain this way until we drop
5248 5256                           * zonehash_lock... we can go ahead and cleanup the
5249 5257                           * zone.
5250 5258                           */
5251 5259                          break;
5252 5260                  }
5253 5261  
5254 5262                  /*
5255 5263                   * Wait for zone_rele_common() or zone_cred_rele() to signal
5256 5264                   * zone_destroy_cv.  zone_destroy_cv is signaled only when
5257 5265                   * some zone's general-purpose reference count reaches one.
5258 5266                   * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5259 5267                   * on zone_destroy_cv, then log the zone's reference counts and
5260 5268                   * continue to wait for zone_rele() and zone_cred_rele().
                            *
                            * If the counts for this zone have already been logged
                            * (ZF_REFCOUNTS_LOGGED is set), the timed wait and the
                            * logging are both skipped and the thread goes straight to
                            * the untimed cv_wait_sig() below.
5261 5269                   */
5262 5270                  if (!refs_have_been_logged) {
5263 5271                          if (!log_refcounts) {
5264 5272                                  /*
5265 5273                                   * This thread hasn't timed out waiting on
5266 5274                                   * zone_destroy_cv yet.  Wait wait_time clock
5267 5275                                   * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5268 5276                                   * seconds) for the zone's references to clear.
                                            *
                                            * The result is stored back into wait_time
                                            * and handled three ways below: > 0 means
                                            * the cv was signaled, 0 means a signal
                                            * arrived, and any other value is treated
                                            * as a timeout.
5269 5277                                   */
5270 5278                                  ASSERT(wait_time > 0);
5271 5279                                  wait_time = cv_reltimedwait_sig(
5272 5280                                      &zone_destroy_cv, &zonehash_lock, wait_time,
5273 5281                                      TR_SEC);
5274 5282                                  if (wait_time > 0) {
5275 5283                                          /*
5276 5284                                           * A thread in zone_rele() or
5277 5285                                           * zone_cred_rele() signaled
5278 5286                                           * zone_destroy_cv before this thread's
5279 5287                                           * wait timed out.  The zone might have
5280 5288                                           * only one reference left; find out!
5281 5289                                           */
5282 5290                                          continue;
5283 5291                                  } else if (wait_time == 0) {
5284 5292                                          /* The thread's process was signaled. */
5285 5293                                          mutex_exit(&zonehash_lock);
5286 5294                                          return (set_errno(EINTR));
5287 5295                                  }
5288 5296  
5289 5297                                  /*
5290 5298                                   * The thread timed out while waiting on
5291 5299                                   * zone_destroy_cv.  Even though the thread
5292 5300                                   * timed out, it has to check whether another
5293 5301                                   * thread woke up from zone_destroy_cv and
5294 5302                                   * destroyed the zone.
5295 5303                                   *
5296 5304                                   * If the zone still exists and has more than
5297 5305                                   * one unreleased general-purpose reference,
5298 5306                                   * then log the zone's reference counts.
5299 5307                                   */
5300 5308                                  log_refcounts = B_TRUE;
5301 5309                                  continue;
5302 5310                          }
5303 5311  
5304 5312                          /*
5305 5313                           * The thread already timed out on zone_destroy_cv while
5306 5314                           * waiting for subsystems to release the zone's last
5307 5315                           * general-purpose references.  Log the zone's reference
5308 5316                           * counts and wait indefinitely on zone_destroy_cv.
5309 5317                           */
5310 5318                          zone_log_refcounts(zone);
5311 5319                  }
5312 5320                  if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5313 5321                          /* The thread's process was signaled. */
5314 5322                          mutex_exit(&zonehash_lock);
5315 5323                          return (set_errno(EINTR));
5316 5324                  }
5317 5325          }
5318 5326  
                   /*
                    * The loop is only left via "break", with zonehash_lock still held
                    * and "zone" pointing at the zone found by the last lookup.
                    */
5319 5327          /*
5320 5328           * Remove CPU cap for this zone now since we're not going to
5321 5329           * fail below this point.
5322 5330           */
5323 5331          cpucaps_zone_remove(zone);
5324 5332  
5325 5333          /* Get rid of the zone's kstats */
5326 5334          zone_kstat_delete(zone);
5327 5335  
5328 5336          /* remove the pfexecd doors */
5329 5337          if (zone->zone_pfexecd != NULL) {
5330 5338                  klpd_freelist(&zone->zone_pfexecd);
5331 5339                  zone->zone_pfexecd = NULL;
5332 5340          }
5333 5341  
5334 5342          /* free brand specific data */
5335 5343          if (ZONE_IS_BRANDED(zone))
5336 5344                  ZBROP(zone)->b_free_brand_data(zone);
5337 5345  
5338 5346          /* Say goodbye to brand framework. */
5339 5347          brand_unregister_zone(zone->zone_brand);
5340 5348  
5341 5349          /*
5342 5350           * It is now safe to let the zone be recreated; remove it from the
5343 5351           * lists.  The memory will not be freed until the last cred
5344 5352           * reference goes away.
5345 5353           */
5346 5354          ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5347 5355          zonecount--;
5348 5356          /* remove from active list and hash tables */
5349 5357          list_remove(&zone_active, zone);
5350 5358          (void) mod_hash_destroy(zonehashbyname,
5351 5359              (mod_hash_key_t)zone->zone_name);
5352 5360          (void) mod_hash_destroy(zonehashbyid,
5353 5361              (mod_hash_key_t)(uintptr_t)zone->zone_id);
                   /* The label hash entry exists only if ZF_HASHED_LABEL is set. */
5354 5362          if (zone->zone_flags & ZF_HASHED_LABEL)
5355 5363                  (void) mod_hash_destroy(zonehashbylabel,
5356 5364                      (mod_hash_key_t)zone->zone_slabel);
5357 5365          mutex_exit(&zonehash_lock);
5358 5366  
5359 5367          /*
5360 5368           * Release the root vnode; we're not using it anymore.  Nor should any
5361 5369           * other thread that might access it exist.
5362 5370           */
5363 5371          if (zone->zone_rootvp != NULL) {
5364 5372                  VN_RELE(zone->zone_rootvp);
5365 5373                  zone->zone_rootvp = NULL;
5366 5374          }
5367 5375  
5368 5376          /* add to deathrow list */
5369 5377          mutex_enter(&zone_deathrow_lock);
5370 5378          list_insert_tail(&zone_deathrow, zone);
5371 5379          mutex_exit(&zone_deathrow_lock);
5372 5380  
5373 5381          /*
5374 5382           * Drop last reference (which was added by zsched()), this will
5375 5383           * free the zone unless there are outstanding cred references.
5376 5384           */
5377 5385          zone_rele(zone);
5378 5386          return (0);
5379 5387  }
5380 5388  
5381 5389  /*
5382 5390   * Systemcall entry point for zone_getattr(2).
5383 5391   */
5384 5392  static ssize_t
5385 5393  zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5386 5394  {
5387 5395          size_t size;
5388 5396          int error = 0, err;
5389 5397          zone_t *zone;
5390 5398          char *zonepath;
5391 5399          char *outstr;
5392 5400          zone_status_t zone_status;
5393 5401          pid_t initpid;
5394 5402          boolean_t global = (curzone == global_zone);
5395 5403          boolean_t inzone = (curzone->zone_id == zoneid);
5396 5404          ushort_t flags;
5397 5405          zone_net_data_t *zbuf;
5398 5406  
5399 5407          mutex_enter(&zonehash_lock);
5400 5408          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5401 5409                  mutex_exit(&zonehash_lock);
5402 5410                  return (set_errno(EINVAL));
5403 5411          }
5404 5412          zone_status = zone_status_get(zone);
5405 5413          if (zone_status < ZONE_IS_INITIALIZED) {
5406 5414                  mutex_exit(&zonehash_lock);
5407 5415                  return (set_errno(EINVAL));
5408 5416          }
5409 5417          zone_hold(zone);
5410 5418          mutex_exit(&zonehash_lock);
5411 5419  
5412 5420          /*
5413 5421           * If not in the global zone, don't show information about other zones,
5414 5422           * unless the system is labeled and the local zone's label dominates
5415 5423           * the other zone.
5416 5424           */
5417 5425          if (!zone_list_access(zone)) {
5418 5426                  zone_rele(zone);
5419 5427                  return (set_errno(EINVAL));
5420 5428          }
5421 5429  
5422 5430          switch (attr) {
5423 5431          case ZONE_ATTR_ROOT:
5424 5432                  if (global) {
5425 5433                          /*
5426 5434                           * Copy the path to trim the trailing "/" (except for
5427 5435                           * the global zone).
5428 5436                           */
5429 5437                          if (zone != global_zone)
5430 5438                                  size = zone->zone_rootpathlen - 1;
5431 5439                          else
5432 5440                                  size = zone->zone_rootpathlen;
5433 5441                          zonepath = kmem_alloc(size, KM_SLEEP);
5434 5442                          bcopy(zone->zone_rootpath, zonepath, size);
5435 5443                          zonepath[size - 1] = '\0';
5436 5444                  } else {
5437 5445                          if (inzone || !is_system_labeled()) {
5438 5446                                  /*
5439 5447                                   * Caller is not in the global zone.
5440 5448                                   * if the query is on the current zone
5441 5449                                   * or the system is not labeled,
5442 5450                                   * just return faked-up path for current zone.
5443 5451                                   */
5444 5452                                  zonepath = "/";
5445 5453                                  size = 2;
5446 5454                          } else {
5447 5455                                  /*
5448 5456                                   * Return related path for current zone.
5449 5457                                   */
5450 5458                                  int prefix_len = strlen(zone_prefix);
5451 5459                                  int zname_len = strlen(zone->zone_name);
5452 5460  
5453 5461                                  size = prefix_len + zname_len + 1;
5454 5462                                  zonepath = kmem_alloc(size, KM_SLEEP);
5455 5463                                  bcopy(zone_prefix, zonepath, prefix_len);
5456 5464                                  bcopy(zone->zone_name, zonepath +
5457 5465                                      prefix_len, zname_len);
5458 5466                                  zonepath[size - 1] = '\0';
5459 5467                          }
5460 5468                  }
5461 5469                  if (bufsize > size)
5462 5470                          bufsize = size;
5463 5471                  if (buf != NULL) {
5464 5472                          err = copyoutstr(zonepath, buf, bufsize, NULL);
5465 5473                          if (err != 0 && err != ENAMETOOLONG)
5466 5474                                  error = EFAULT;
5467 5475                  }
5468 5476                  if (global || (is_system_labeled() && !inzone))
5469 5477                          kmem_free(zonepath, size);
5470 5478                  break;
5471 5479  
5472 5480          case ZONE_ATTR_NAME:
5473 5481                  size = strlen(zone->zone_name) + 1;
5474 5482                  if (bufsize > size)
5475 5483                          bufsize = size;
5476 5484                  if (buf != NULL) {
5477 5485                          err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5478 5486                          if (err != 0 && err != ENAMETOOLONG)
5479 5487                                  error = EFAULT;
5480 5488                  }
5481 5489                  break;
5482 5490  
5483 5491          case ZONE_ATTR_STATUS:
5484 5492                  /*
5485 5493                   * Since we're not holding zonehash_lock, the zone status
5486 5494                   * may be anything; leave it up to userland to sort it out.
5487 5495                   */
5488 5496                  size = sizeof (zone_status);
5489 5497                  if (bufsize > size)
5490 5498                          bufsize = size;
5491 5499                  zone_status = zone_status_get(zone);
5492 5500                  if (buf != NULL &&
5493 5501                      copyout(&zone_status, buf, bufsize) != 0)
5494 5502                          error = EFAULT;
5495 5503                  break;
5496 5504          case ZONE_ATTR_FLAGS:
5497 5505                  size = sizeof (zone->zone_flags);
5498 5506                  if (bufsize > size)
5499 5507                          bufsize = size;
5500 5508                  flags = zone->zone_flags;
5501 5509                  if (buf != NULL &&
5502 5510                      copyout(&flags, buf, bufsize) != 0)
5503 5511                          error = EFAULT;
5504 5512                  break;
5505 5513          case ZONE_ATTR_PRIVSET:
5506 5514                  size = sizeof (priv_set_t);
5507 5515                  if (bufsize > size)
5508 5516                          bufsize = size;
5509 5517                  if (buf != NULL &&
5510 5518                      copyout(zone->zone_privset, buf, bufsize) != 0)
5511 5519                          error = EFAULT;
5512 5520                  break;
5513 5521          case ZONE_ATTR_UNIQID:
5514 5522                  size = sizeof (zone->zone_uniqid);
5515 5523                  if (bufsize > size)
5516 5524                          bufsize = size;
5517 5525                  if (buf != NULL &&
5518 5526                      copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5519 5527                          error = EFAULT;
5520 5528                  break;
5521 5529          case ZONE_ATTR_POOLID:
5522 5530                  {
5523 5531                          pool_t *pool;
5524 5532                          poolid_t poolid;
5525 5533  
5526 5534                          if (pool_lock_intr() != 0) {
5527 5535                                  error = EINTR;
5528 5536                                  break;
5529 5537                          }
5530 5538                          pool = zone_pool_get(zone);
5531 5539                          poolid = pool->pool_id;
5532 5540                          pool_unlock();
5533 5541                          size = sizeof (poolid);
5534 5542                          if (bufsize > size)
5535 5543                                  bufsize = size;
5536 5544                          if (buf != NULL && copyout(&poolid, buf, size) != 0)
5537 5545                                  error = EFAULT;
5538 5546                  }
5539 5547                  break;
5540 5548          case ZONE_ATTR_SLBL:
5541 5549                  size = sizeof (bslabel_t);
5542 5550                  if (bufsize > size)
5543 5551                          bufsize = size;
5544 5552                  if (zone->zone_slabel == NULL)
5545 5553                          error = EINVAL;
5546 5554                  else if (buf != NULL &&
5547 5555                      copyout(label2bslabel(zone->zone_slabel), buf,
5548 5556                      bufsize) != 0)
5549 5557                          error = EFAULT;
5550 5558                  break;
5551 5559          case ZONE_ATTR_INITPID:
5552 5560                  size = sizeof (initpid);
5553 5561                  if (bufsize > size)
5554 5562                          bufsize = size;
5555 5563                  initpid = zone->zone_proc_initpid;
5556 5564                  if (initpid == -1) {
5557 5565                          error = ESRCH;
5558 5566                          break;
5559 5567                  }
5560 5568                  if (buf != NULL &&
5561 5569                      copyout(&initpid, buf, bufsize) != 0)
5562 5570                          error = EFAULT;
5563 5571                  break;
5564 5572          case ZONE_ATTR_BRAND:
5565 5573                  size = strlen(zone->zone_brand->b_name) + 1;
5566 5574  
5567 5575                  if (bufsize > size)
5568 5576                          bufsize = size;
5569 5577                  if (buf != NULL) {
5570 5578                          err = copyoutstr(zone->zone_brand->b_name, buf,
5571 5579                              bufsize, NULL);
5572 5580                          if (err != 0 && err != ENAMETOOLONG)
5573 5581                                  error = EFAULT;
5574 5582                  }
5575 5583                  break;
5576 5584          case ZONE_ATTR_INITNAME:
5577 5585                  size = strlen(zone->zone_initname) + 1;
5578 5586                  if (bufsize > size)
5579 5587                          bufsize = size;
5580 5588                  if (buf != NULL) {
5581 5589                          err = copyoutstr(zone->zone_initname, buf, bufsize,
5582 5590                              NULL);
5583 5591                          if (err != 0 && err != ENAMETOOLONG)
5584 5592                                  error = EFAULT;
5585 5593                  }
5586 5594                  break;
5587 5595          case ZONE_ATTR_BOOTARGS:
5588 5596                  if (zone->zone_bootargs == NULL)
5589 5597                          outstr = "";
5590 5598                  else
5591 5599                          outstr = zone->zone_bootargs;
5592 5600                  size = strlen(outstr) + 1;
5593 5601                  if (bufsize > size)
5594 5602                          bufsize = size;
5595 5603                  if (buf != NULL) {
5596 5604                          err = copyoutstr(outstr, buf, bufsize, NULL);
5597 5605                          if (err != 0 && err != ENAMETOOLONG)
5598 5606                                  error = EFAULT;
5599 5607                  }
5600 5608                  break;
5601 5609          case ZONE_ATTR_PHYS_MCAP:
5602 5610                  size = sizeof (zone->zone_phys_mcap);
5603 5611                  if (bufsize > size)
5604 5612                          bufsize = size;
5605 5613                  if (buf != NULL &&
5606 5614                      copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5607 5615                          error = EFAULT;
5608 5616                  break;
5609 5617          case ZONE_ATTR_SCHED_CLASS:
5610 5618                  mutex_enter(&class_lock);
5611 5619  
5612 5620                  if (zone->zone_defaultcid >= loaded_classes)
5613 5621                          outstr = "";
5614 5622                  else
5615 5623                          outstr = sclass[zone->zone_defaultcid].cl_name;
5616 5624                  size = strlen(outstr) + 1;
5617 5625                  if (bufsize > size)
5618 5626                          bufsize = size;
5619 5627                  if (buf != NULL) {
5620 5628                          err = copyoutstr(outstr, buf, bufsize, NULL);
5621 5629                          if (err != 0 && err != ENAMETOOLONG)
5622 5630                                  error = EFAULT;
5623 5631                  }
5624 5632  
5625 5633                  mutex_exit(&class_lock);
5626 5634                  break;
5627 5635          case ZONE_ATTR_HOSTID:
5628 5636                  if (zone->zone_hostid != HW_INVALID_HOSTID &&
5629 5637                      bufsize == sizeof (zone->zone_hostid)) {
5630 5638                          size = sizeof (zone->zone_hostid);
5631 5639                          if (buf != NULL && copyout(&zone->zone_hostid, buf,
5632 5640                              bufsize) != 0)
5633 5641                                  error = EFAULT;
5634 5642                  } else {
5635 5643                          error = EINVAL;
5636 5644                  }
5637 5645                  break;
5638 5646          case ZONE_ATTR_FS_ALLOWED:
5639 5647                  if (zone->zone_fs_allowed == NULL)
5640 5648                          outstr = "";
5641 5649                  else
5642 5650                          outstr = zone->zone_fs_allowed;
5643 5651                  size = strlen(outstr) + 1;
5644 5652                  if (bufsize > size)
5645 5653                          bufsize = size;
5646 5654                  if (buf != NULL) {
5647 5655                          err = copyoutstr(outstr, buf, bufsize, NULL);
5648 5656                          if (err != 0 && err != ENAMETOOLONG)
5649 5657                                  error = EFAULT;
5650 5658                  }
5651 5659                  break;
5652 5660          case ZONE_ATTR_SECFLAGS:
5653 5661                  size = sizeof (zone->zone_secflags);
5654 5662                  if (bufsize > size)
5655 5663                          bufsize = size;
5656 5664                  if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
5657 5665                          error = EFAULT;
5658 5666                  break;
5659 5667          case ZONE_ATTR_NETWORK:
5660 5668                  bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
5661 5669                  size = bufsize;
5662 5670                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5663 5671                  if (copyin(buf, zbuf, bufsize) != 0) {
5664 5672                          error = EFAULT;
5665 5673                  } else {
5666 5674                          error = zone_get_network(zoneid, zbuf);
5667 5675                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5668 5676                                  error = EFAULT;
5669 5677                  }
5670 5678                  kmem_free(zbuf, bufsize);
5671 5679                  break;
5672 5680          default:
5673 5681                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5674 5682                          size = bufsize;
5675 5683                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5676 5684                  } else {
5677 5685                          error = EINVAL;
5678 5686                  }
5679 5687          }
5680 5688          zone_rele(zone);
5681 5689  
5682 5690          if (error)
5683 5691                  return (set_errno(error));
5684 5692          return ((ssize_t)size);
5685 5693  }
5686 5694  
5687 5695  /*
5688 5696   * Systemcall entry point for zone_setattr(2).
5689 5697   */
5690 5698  /*ARGSUSED*/
5691 5699  static int
5692 5700  zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5693 5701  {
5694 5702          zone_t *zone;
5695 5703          zone_status_t zone_status;
5696 5704          int err = -1;
5697 5705          zone_net_data_t *zbuf;
5698 5706  
5699 5707          if (secpolicy_zone_config(CRED()) != 0)
5700 5708                  return (set_errno(EPERM));
5701 5709  
5702 5710          /*
5703 5711           * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5704 5712           * global zone.
5705 5713           */
5706 5714          if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5707 5715                  return (set_errno(EINVAL));
5708 5716          }
5709 5717  
5710 5718          mutex_enter(&zonehash_lock);
5711 5719          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5712 5720                  mutex_exit(&zonehash_lock);
5713 5721                  return (set_errno(EINVAL));
5714 5722          }
5715 5723          zone_hold(zone);
5716 5724          mutex_exit(&zonehash_lock);
5717 5725  
5718 5726          /*
5719 5727           * At present most attributes can only be set on non-running,
5720 5728           * non-global zones.
5721 5729           */
5722 5730          zone_status = zone_status_get(zone);
5723 5731          if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5724 5732                  err = EINVAL;
5725 5733                  goto done;
5726 5734          }
5727 5735  
5728 5736          switch (attr) {
5729 5737          case ZONE_ATTR_INITNAME:
5730 5738                  err = zone_set_initname(zone, (const char *)buf);
5731 5739                  break;
5732 5740          case ZONE_ATTR_INITNORESTART:
5733 5741                  zone->zone_restart_init = B_FALSE;
5734 5742                  err = 0;
5735 5743                  break;
5736 5744          case ZONE_ATTR_BOOTARGS:
5737 5745                  err = zone_set_bootargs(zone, (const char *)buf);
5738 5746                  break;
5739 5747          case ZONE_ATTR_BRAND:
5740 5748                  err = zone_set_brand(zone, (const char *)buf);
5741 5749                  break;
5742 5750          case ZONE_ATTR_FS_ALLOWED:
5743 5751                  err = zone_set_fs_allowed(zone, (const char *)buf);
5744 5752                  break;
5745 5753          case ZONE_ATTR_SECFLAGS:
5746 5754                  err = zone_set_secflags(zone, (psecflags_t *)buf);
5747 5755                  break;
5748 5756          case ZONE_ATTR_PHYS_MCAP:
5749 5757                  err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5750 5758                  break;
5751 5759          case ZONE_ATTR_SCHED_CLASS:
5752 5760                  err = zone_set_sched_class(zone, (const char *)buf);
5753 5761                  break;
5754 5762          case ZONE_ATTR_HOSTID:
5755 5763                  if (bufsize == sizeof (zone->zone_hostid)) {
5756 5764                          if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5757 5765                                  err = 0;
5758 5766                          else
5759 5767                                  err = EFAULT;
5760 5768                  } else {
5761 5769                          err = EINVAL;
5762 5770                  }
5763 5771                  break;
5764 5772          case ZONE_ATTR_NETWORK:
5765 5773                  if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5766 5774                          err = EINVAL;
5767 5775                          break;
5768 5776                  }
5769 5777                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5770 5778                  if (copyin(buf, zbuf, bufsize) != 0) {
5771 5779                          kmem_free(zbuf, bufsize);
5772 5780                          err = EFAULT;
5773 5781                          break;
5774 5782                  }
5775 5783                  err = zone_set_network(zoneid, zbuf);
5776 5784                  kmem_free(zbuf, bufsize);
5777 5785                  break;
5778 5786          default:
5779 5787                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5780 5788                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5781 5789                  else
5782 5790                          err = EINVAL;
5783 5791          }
5784 5792  
5785 5793  done:
5786 5794          zone_rele(zone);
5787 5795          ASSERT(err != -1);
5788 5796          return (err != 0 ? set_errno(err) : 0);
5789 5797  }
5790 5798  
5791 5799  /*
5792 5800   * Return zero if the process has at least one vnode mapped in to its
5793 5801   * address space which shouldn't be allowed to change zones.
5794 5802   *
5795 5803   * Also return zero if the process has any shared mappings which reserve
5796 5804   * swap.  This is because the counting for zone.max-swap does not allow swap
5797 5805   * reservation to be shared between zones.  zone swap reservation is counted
5798 5806   * on zone->zone_max_swap.
5799 5807   */
5800 5808  static int
5801 5809  as_can_change_zones(void)
5802 5810  {
5803 5811          proc_t *pp = curproc;
5804 5812          struct seg *seg;
5805 5813          struct as *as = pp->p_as;
5806 5814          vnode_t *vp;
5807 5815          int allow = 1;
5808 5816  
5809 5817          ASSERT(pp->p_as != &kas);
5810 5818          AS_LOCK_ENTER(as, RW_READER);
5811 5819          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5812 5820  
5813 5821                  /*
5814 5822                   * Cannot enter zone with shared anon memory which
5815 5823                   * reserves swap.  See comment above.
5816 5824                   */
5817 5825                  if (seg_can_change_zones(seg) == B_FALSE) {
5818 5826                          allow = 0;
5819 5827                          break;
5820 5828                  }
5821 5829                  /*
5822 5830                   * if we can't get a backing vnode for this segment then skip
5823 5831                   * it.
5824 5832                   */
5825 5833                  vp = NULL;
5826 5834                  if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5827 5835                          continue;
5828 5836                  if (!vn_can_change_zones(vp)) { /* bail on first match */
5829 5837                          allow = 0;
5830 5838                          break;
5831 5839                  }
5832 5840          }
5833 5841          AS_LOCK_EXIT(as);
5834 5842          return (allow);
5835 5843  }
5836 5844  
5837 5845  /*
5838 5846   * Count swap reserved by curproc's address space
5839 5847   */
5840 5848  static size_t
5841 5849  as_swresv(void)
5842 5850  {
5843 5851          proc_t *pp = curproc;
5844 5852          struct seg *seg;
5845 5853          struct as *as = pp->p_as;
5846 5854          size_t swap = 0;
5847 5855  
5848 5856          ASSERT(pp->p_as != &kas);
5849 5857          ASSERT(AS_WRITE_HELD(as));
5850 5858          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5851 5859                  swap += seg_swresv(seg);
5852 5860  
5853 5861          return (swap);
5854 5862  }
5855 5863  
5856 5864  /*
5857 5865   * Systemcall entry point for zone_enter().
5858 5866   *
5859 5867   * The current process is injected into said zone.  In the process
5860 5868   * it will change its project membership, privileges, rootdir/cwd,
5861 5869   * zone-wide rctls, and pool association to match those of the zone.
5862 5870   *
5863 5871   * The first zone_enter() called while the zone is in the ZONE_IS_READY
5864 5872   * state will transition it to ZONE_IS_RUNNING.  Processes may only
5865 5873   * enter a zone that is "ready" or "running".
            *
            * Returns 0 on success.  On failure the result of set_errno() is
            * returned with one of:
            *   EPERM   secpolicy_zone_config() refused the caller, or the zone's
            *           privilege set is not a subset of the caller's CR_OPPRIV set
            *   EINVAL  zoneid is outside [MIN_USERZONEID, MAX_ZONEID], the caller
            *           is not in the global zone, the zone was not found, one of
            *           the process contract checks failed, or the zone's status
            *           is below ZONE_IS_READY or at/after ZONE_IS_SHUTTING_DOWN
            *   EINTR   holdlwps() failed or pool_lock_intr() returned non-zero
            *   EBADF   files_can_change_zones() refused
            *   EFAULT  as_can_change_zones() refused
            * or the error returned by pool_do_bind().
5866 5874   */
5867 5875  static int
5868 5876  zone_enter(zoneid_t zoneid)
5869 5877  {
5870 5878          zone_t *zone;
5871 5879          vnode_t *vp;
5872 5880          proc_t *pp = curproc;
5873 5881          contract_t *ct;
5874 5882          cont_process_t *ctp;
5875 5883          task_t *tk, *oldtk;
5876 5884          kproject_t *zone_proj0;
5877 5885          cred_t *cr, *newcr;
5878 5886          pool_t *oldpool, *newpool;
5879 5887          sess_t *sp;
5880 5888          uid_t uid;
5881 5889          zone_status_t status;
5882 5890          int err = 0;
5883 5891          rctl_entity_p_t e;
5884 5892          size_t swap;
5885 5893          kthread_id_t t;
5886 5894  
                   /*
                    * Failures before the lwps are held return directly; every later
                    * failure goes through "out", which calls continuelwps().
                    */
5887 5895          if (secpolicy_zone_config(CRED()) != 0)
5888 5896                  return (set_errno(EPERM));
5889 5897          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5890 5898                  return (set_errno(EINVAL));
5891 5899  
5892 5900          /*
5893 5901           * Stop all lwps so we don't need to hold a lock to look at
5894 5902           * curproc->p_zone.  This needs to happen before we grab any
5895 5903           * locks to avoid deadlock (another lwp in the process could
5896 5904           * be waiting for the held lock).
5897 5905           */
5898 5906          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5899 5907                  return (set_errno(EINTR));
5900 5908  
5901 5909          /*
5902 5910           * Make sure we're not changing zones with files open or mapped in
5903 5911           * to our address space which shouldn't be changing zones.
5904 5912           */
5905 5913          if (!files_can_change_zones()) {
5906 5914                  err = EBADF;
5907 5915                  goto out;
5908 5916          }
5909 5917          if (!as_can_change_zones()) {
5910 5918                  err = EFAULT;
5911 5919                  goto out;
5912 5920          }
5913 5921  
                   /* Only a process currently in the global zone may enter a zone. */
5914 5922          mutex_enter(&zonehash_lock);
5915 5923          if (pp->p_zone != global_zone) {
5916 5924                  mutex_exit(&zonehash_lock);
5917 5925                  err = EINVAL;
5918 5926                  goto out;
5919 5927          }
5920 5928  
5921 5929          zone = zone_find_all_by_id(zoneid);
5922 5930          if (zone == NULL) {
5923 5931                  mutex_exit(&zonehash_lock);
5924 5932                  err = EINVAL;
5925 5933                  goto out;
5926 5934          }
5927 5935  
5928 5936          /*
5929 5937           * To prevent processes in a zone from holding contracts on
5930 5938           * extrazonal resources, and to avoid process contract
5931 5939           * memberships which span zones, contract holders and processes
5932 5940           * which aren't the sole members of their encapsulating process
5933 5941           * contracts are not allowed to zone_enter.
5934 5942           */
5935 5943          ctp = pp->p_ct_process;
5936 5944          ct = &ctp->conp_contract;
5937 5945          mutex_enter(&ct->ct_lock);
5938 5946          mutex_enter(&pp->p_lock);
5939 5947          if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5940 5948                  mutex_exit(&pp->p_lock);
5941 5949                  mutex_exit(&ct->ct_lock);
5942 5950                  mutex_exit(&zonehash_lock);
5943 5951                  err = EINVAL;
5944 5952                  goto out;
5945 5953          }
5946 5954  
5947 5955          /*
5948 5956           * Moreover, we don't allow processes whose encapsulating
5949 5957           * process contracts have inherited extrazonal contracts.
5950 5958           * While it would be easier to eliminate all process contracts
5951 5959           * with inherited contracts, we need to be able to give a
5952 5960           * restarted init (or other zone-penetrating process) its
5953 5961           * predecessor's contracts.
5954 5962           */
5955 5963          if (ctp->conp_ninherited != 0) {
5956 5964                  contract_t *next;
5957 5965                  for (next = list_head(&ctp->conp_inherited); next;
5958 5966                      next = list_next(&ctp->conp_inherited, next)) {
5959 5967                          if (contract_getzuniqid(next) != zone->zone_uniqid) {
5960 5968                                  mutex_exit(&pp->p_lock);
5961 5969                                  mutex_exit(&ct->ct_lock);
5962 5970                                  mutex_exit(&zonehash_lock);
5963 5971                                  err = EINVAL;
5964 5972                                  goto out;
5965 5973                          }
5966 5974                  }
5967 5975          }
5968 5976  
5969 5977          mutex_exit(&pp->p_lock);
5970 5978          mutex_exit(&ct->ct_lock);
5971 5979  
5972 5980          status = zone_status_get(zone);
5973 5981          if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5974 5982                  /*
5975 5983                   * Can't join
5976 5984                   */
5977 5985                  mutex_exit(&zonehash_lock);
5978 5986                  err = EINVAL;
5979 5987                  goto out;
5980 5988          }
5981 5989  
5982 5990          /*
5983 5991           * Make sure new priv set is within the permitted set for caller
5984 5992           */
5985 5993          if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5986 5994                  mutex_exit(&zonehash_lock);
5987 5995                  err = EPERM;
5988 5996                  goto out;
5989 5997          }
5990 5998          /*
5991 5999           * We want to momentarily drop zonehash_lock while we optimistically
5992 6000           * bind curproc to the pool it should be running in.  This is safe
5993 6001           * since the zone can't disappear (we have a hold on it).
5994 6002           */
5995 6003          zone_hold(zone);
5996 6004          mutex_exit(&zonehash_lock);
5997 6005  
5998 6006          /*
5999 6007           * Grab pool_lock to keep the pools configuration from changing
6000 6008           * and to stop ourselves from getting rebound to another pool
6001 6009           * until we join the zone.
6002 6010           */
6003 6011          if (pool_lock_intr() != 0) {
6004 6012                  zone_rele(zone);
6005 6013                  err = EINTR;
6006 6014                  goto out;
6007 6015          }
6008 6016          ASSERT(secpolicy_pool(CRED()) == 0);
6009 6017          /*
6010 6018           * Bind ourselves to the pool currently associated with the zone.
6011 6019           */
6012 6020          oldpool = curproc->p_pool;
6013 6021          newpool = zone_pool_get(zone);
6014 6022          if (pool_state == POOL_ENABLED && newpool != oldpool &&
6015 6023              (err = pool_do_bind(newpool, P_PID, P_MYID,
6016 6024              POOL_BIND_ALL)) != 0) {
6017 6025                  pool_unlock();
6018 6026                  zone_rele(zone);
6019 6027                  goto out;
6020 6028          }
6021 6029  
6022 6030          /*
6023 6031           * Grab cpu_lock now; we'll need it later when we call
6024 6032           * task_join().
6025 6033           */
6026 6034          mutex_enter(&cpu_lock);
6027 6035          mutex_enter(&zonehash_lock);
6028 6036          /*
6029 6037           * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6030 6038           */
6031 6039          if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6032 6040                  /*
6033 6041                   * Can't join anymore.
                            * Undo the optimistic pool binding made above before
                            * failing.
6034 6042                   */
6035 6043                  mutex_exit(&zonehash_lock);
6036 6044                  mutex_exit(&cpu_lock);
6037 6045                  if (pool_state == POOL_ENABLED &&
6038 6046                      newpool != oldpool)
6039 6047                          (void) pool_do_bind(oldpool, P_PID, P_MYID,
6040 6048                              POOL_BIND_ALL);
6041 6049                  pool_unlock();
6042 6050                  zone_rele(zone);
6043 6051                  err = EINVAL;
6044 6052                  goto out;
6045 6053          }
6046 6054  
6047 6055          /*
6048 6056           * a_lock must be held while transferring locked memory and swap
6049 6057           * reservation from the global zone to the non global zone because
6050 6058           * asynchronous faults on the processes' address space can lock
6051 6059           * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6052 6060           * segments respectively.
6053 6061           */
6054 6062          AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6055 6063          swap = as_swresv();
6056 6064          mutex_enter(&pp->p_lock);
6057 6065          zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6058 6066          /* verify that we do not exceed any task or lwp limits */
                   /*
                    * NOTE(review): no limit check is visible at this point; the
                    * counts below are only incremented.
                    */
6059 6067          mutex_enter(&zone->zone_nlwps_lock);
6060 6068          /* add new lwps to zone and zone's proj0 */
6061 6069          zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6062 6070          zone->zone_nlwps += pp->p_lwpcnt;
6063 6071          /* add 1 task to zone's proj0 */
6064 6072          zone_proj0->kpj_ntasks += 1;
6065 6073  
6066 6074          zone_proj0->kpj_nprocs++;
6067 6075          zone->zone_nprocs++;
6068 6076          mutex_exit(&zone->zone_nlwps_lock);
6069 6077  
                   /* add locked memory and swap reservation to the new zone */
6070 6078          mutex_enter(&zone->zone_mem_lock);
6071 6079          zone->zone_locked_mem += pp->p_locked_mem;
6072 6080          zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6073 6081          zone->zone_max_swap += swap;
6074 6082          mutex_exit(&zone->zone_mem_lock);
6075 6083  
6076 6084          mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6077 6085          zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6078 6086          mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6079 6087  
6080 6088          /* remove lwps and process from proc's old zone and old project */
6081 6089          mutex_enter(&pp->p_zone->zone_nlwps_lock);
6082 6090          pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6083 6091          pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6084 6092          pp->p_task->tk_proj->kpj_nprocs--;
6085 6093          pp->p_zone->zone_nprocs--;
6086 6094          mutex_exit(&pp->p_zone->zone_nlwps_lock);
6087 6095  
                   /* remove the same locked memory and swap from the old zone */
6088 6096          mutex_enter(&pp->p_zone->zone_mem_lock);
6089 6097          pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6090 6098          pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6091 6099          pp->p_zone->zone_max_swap -= swap;
6092 6100          mutex_exit(&pp->p_zone->zone_mem_lock);
6093 6101  
6094 6102          mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6095 6103          pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6096 6104          mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6097 6105  
                   /* From here on pp->p_zone refers to the new zone. */
6098 6106          pp->p_flag |= SZONETOP;
6099 6107          pp->p_zone = zone;
6100 6108          mutex_exit(&pp->p_lock);
6101 6109          AS_LOCK_EXIT(pp->p_as);
6102 6110  
6103 6111          /*
6104 6112           * Joining the zone cannot fail from now on.
6105 6113           *
6106 6114           * This means that a lot of the following code can be commonized and
6107 6115           * shared with zsched().
6108 6116           */
6109 6117  
6110 6118          /*
6111 6119           * If the process contract fmri was inherited, we need to
6112 6120           * flag this so that any contract status will not leak
6113 6121           * extra zone information, svc_fmri in this case
6114 6122           */
6115 6123          if (ctp->conp_svc_ctid != ct->ct_id) {
6116 6124                  mutex_enter(&ct->ct_lock);
6117 6125                  ctp->conp_svc_zone_enter = ct->ct_id;
6118 6126                  mutex_exit(&ct->ct_lock);
6119 6127          }
6120 6128  
6121 6129          /*
6122 6130           * Reset the encapsulating process contract's zone.
6123 6131           */
6124 6132          ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6125 6133          contract_setzuniqid(ct, zone->zone_uniqid);
6126 6134  
6127 6135          /*
6128 6136           * Create a new task and associate the process with the project keyed
6129 6137           * by (projid,zoneid).
6130 6138           *
6131 6139           * We might as well be in project 0; the global zone's projid doesn't
6132 6140           * make much sense in a zone anyhow.
6133 6141           *
6134 6142           * This also increments zone_ntasks, and returns with p_lock held.
6135 6143           */
6136 6144          tk = task_create(0, zone);
6137 6145          oldtk = task_join(tk, 0);
6138 6146          mutex_exit(&cpu_lock);
6139 6147  
6140 6148          /*
6141 6149           * call RCTLOP_SET functions on this proc
6142 6150           */
6143 6151          e.rcep_p.zone = zone;
6144 6152          e.rcep_t = RCENTITY_ZONE;
6145 6153          (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6146 6154              RCD_CALLBACK);
6147 6155          mutex_exit(&pp->p_lock);
6148 6156  
6149 6157          /*
6150 6158           * We don't need to hold any of zsched's locks here; not only do we know
6151 6159           * the process and zone aren't going away, we know its session isn't
6152 6160           * changing either.
6153 6161           *
6154 6162           * By joining zsched's session here, we mimic the behavior in the
6155 6163           * global zone of init's sid being the pid of sched.  We extend this
6156 6164           * to all zlogin-like zone_enter()'ing processes as well.
6157 6165           */
6158 6166          mutex_enter(&pidlock);
6159 6167          sp = zone->zone_zsched->p_sessp;
6160 6168          sess_hold(zone->zone_zsched);
6161 6169          mutex_enter(&pp->p_lock);
6162 6170          pgexit(pp);
6163 6171          sess_rele(pp->p_sessp, B_TRUE);
6164 6172          pp->p_sessp = sp;
6165 6173          pgjoin(pp, zone->zone_zsched->p_pidp);
6166 6174  
6167 6175          /*
6168 6176           * If any threads are scheduled to be placed on zone wait queue they
6169 6177           * should abandon the idea since the wait queue is changing.
6170 6178           * We need to be holding pidlock & p_lock to do this.
6171 6179           */
6172 6180          if ((t = pp->p_tlist) != NULL) {
6173 6181                  do {
6174 6182                          thread_lock(t);
6175 6183                          /*
6176 6184                           * Kick this thread so that it doesn't sit
6177 6185                           * on a wrong wait queue.
6178 6186                           */
6179 6187                          if (ISWAITING(t))
6180 6188                                  setrun_locked(t);
6181 6189  
6182 6190                          if (t->t_schedflag & TS_ANYWAITQ)
6183 6191                                  t->t_schedflag &= ~ TS_ANYWAITQ;
6184 6192  
6185 6193                          thread_unlock(t);
6186 6194                  } while ((t = t->t_forw) != pp->p_tlist);
6187 6195          }
6188 6196  
6189 6197          /*
6190 6198           * If there is a default scheduling class for the zone and it is not
6191 6199           * the class we are currently in, change all of the threads in the
6192 6200           * process to the new class.  We need to be holding pidlock & p_lock
6193 6201           * when we call parmsset so this is a good place to do it.
6194 6202           */
6195 6203          if (zone->zone_defaultcid > 0 &&
6196 6204              zone->zone_defaultcid != curthread->t_cid) {
6197 6205                  pcparms_t pcparms;
6198 6206  
6199 6207                  pcparms.pc_cid = zone->zone_defaultcid;
6200 6208                  pcparms.pc_clparms[0] = 0;
6201 6209  
6202 6210                  /*
6203 6211                   * If setting the class fails, we still want to enter the zone.
6204 6212                   */
6205 6213                  if ((t = pp->p_tlist) != NULL) {
6206 6214                          do {
6207 6215                                  (void) parmsset(&pcparms, t);
6208 6216                          } while ((t = t->t_forw) != pp->p_tlist);
6209 6217                  }
6210 6218          }
6211 6219  
6212 6220          mutex_exit(&pp->p_lock);
6213 6221          mutex_exit(&pidlock);
6214 6222  
                   /*
                    * Drop zonehash_lock, which has been held since the zone's status
                    * was re-checked above.
                    */
6215 6223          mutex_exit(&zonehash_lock);
6216 6224          /*
6217 6225           * We're firmly in the zone; let pools progress.
6218 6226           */
6219 6227          pool_unlock();
6220 6228          task_rele(oldtk);
6221 6229          /*
6222 6230           * We don't need to retain a hold on the zone since we already
6223 6231           * incremented zone_ntasks, so the zone isn't going anywhere.
6224 6232           */
6225 6233          zone_rele(zone);
6226 6234  
6227 6235          /*
6228 6236           * Chroot
                    * Both the current directory and the root directory are switched
                    * to the zone's root vnode.
6229 6237           */
6230 6238          vp = zone->zone_rootvp;
6231 6239          zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6232 6240          zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6233 6241  
6234 6242          /*
6235 6243           * Change process security flags.  Note that the _effective_ flags
6236 6244           * cannot change
6237 6245           */
6238 6246          secflags_copy(&pp->p_secflags.psf_lower,
6239 6247              &zone->zone_secflags.psf_lower);
6240 6248          secflags_copy(&pp->p_secflags.psf_upper,
6241 6249              &zone->zone_secflags.psf_upper);
6242 6250          secflags_copy(&pp->p_secflags.psf_inherit,
6243 6251              &zone->zone_secflags.psf_inherit);
6244 6252  
6245 6253          /*
6246 6254           * Change process credentials
                    * The new cred starts as a copy of the current one and is then
                    * bound to the zone.
6247 6255           */
6248 6256          newcr = cralloc();
6249 6257          mutex_enter(&pp->p_crlock);
6250 6258          cr = pp->p_cred;
6251 6259          crcopy_to(cr, newcr);
6252 6260          crsetzone(newcr, zone);
6253 6261          pp->p_cred = newcr;
6254 6262  
6255 6263          /*
6256 6264           * Restrict all process privilege sets to zone limit
6257 6265           */
6258 6266          priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6259 6267          priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6260 6268          priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6261 6269          priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6262 6270          mutex_exit(&pp->p_crlock);
6263 6271          crset(pp, newcr);
6264 6272  
6265 6273          /*
6266 6274           * Adjust upcount to reflect zone entry.
6267 6275           */
6268 6276          uid = crgetruid(newcr);
6269 6277          mutex_enter(&pidlock);
6270 6278          upcount_dec(uid, GLOBAL_ZONEID);
6271 6279          upcount_inc(uid, zoneid);
6272 6280          mutex_exit(&pidlock);
6273 6281  
6274 6282          /*
6275 6283           * Set up core file path and content.
6276 6284           */
6277 6285          set_core_defaults();
6278 6286  
6279 6287  out:
6280 6288          /*
6281 6289           * Let the other lwps continue.
                    * This is reached on success (err == 0) as well as on every
                    * failure that occurs after the lwps were held.
6282 6290           */
6283 6291          mutex_enter(&pp->p_lock);
6284 6292          if (curthread != pp->p_agenttp)
6285 6293                  continuelwps(pp);
6286 6294          mutex_exit(&pp->p_lock);
6287 6295  
6288 6296          return (err != 0 ? set_errno(err) : 0);
6289 6297  }
6290 6298  
6291 6299  /*
6292 6300   * Systemcall entry point for zone_list(2).
6293 6301   *
6294 6302   * Processes running in a (non-global) zone only see themselves.
6295 6303   * On labeled systems, they see all zones whose label they dominate.
6296 6304   */
6297 6305  static int
6298 6306  zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6299 6307  {
6300 6308          zoneid_t *zoneids;
6301 6309          zone_t *zone, *myzone;
6302 6310          uint_t user_nzones, real_nzones;
6303 6311          uint_t domi_nzones;
6304 6312          int error;
6305 6313  
6306 6314          if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6307 6315                  return (set_errno(EFAULT));
6308 6316  
6309 6317          myzone = curproc->p_zone;
6310 6318          if (myzone != global_zone) {
6311 6319                  bslabel_t *mybslab;
6312 6320  
6313 6321                  if (!is_system_labeled()) {
6314 6322                          /* just return current zone */
6315 6323                          real_nzones = domi_nzones = 1;
6316 6324                          zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6317 6325                          zoneids[0] = myzone->zone_id;
6318 6326                  } else {
6319 6327                          /* return all zones that are dominated */
6320 6328                          mutex_enter(&zonehash_lock);
6321 6329                          real_nzones = zonecount;
6322 6330                          domi_nzones = 0;
6323 6331                          if (real_nzones > 0) {
6324 6332                                  zoneids = kmem_alloc(real_nzones *
6325 6333                                      sizeof (zoneid_t), KM_SLEEP);
6326 6334                                  mybslab = label2bslabel(myzone->zone_slabel);
6327 6335                                  for (zone = list_head(&zone_active);
6328 6336                                      zone != NULL;
6329 6337                                      zone = list_next(&zone_active, zone)) {
6330 6338                                          if (zone->zone_id == GLOBAL_ZONEID)
6331 6339                                                  continue;
6332 6340                                          if (zone != myzone &&
6333 6341                                              (zone->zone_flags & ZF_IS_SCRATCH))
6334 6342                                                  continue;
6335 6343                                          /*
6336 6344                                           * Note that a label always dominates
6337 6345                                           * itself, so myzone is always included
6338 6346                                           * in the list.
6339 6347                                           */
6340 6348                                          if (bldominates(mybslab,
6341 6349                                              label2bslabel(zone->zone_slabel))) {
6342 6350                                                  zoneids[domi_nzones++] =
6343 6351                                                      zone->zone_id;
6344 6352                                          }
6345 6353                                  }
6346 6354                          }
6347 6355                          mutex_exit(&zonehash_lock);
6348 6356                  }
6349 6357          } else {
6350 6358                  mutex_enter(&zonehash_lock);
6351 6359                  real_nzones = zonecount;
6352 6360                  domi_nzones = 0;
6353 6361                  if (real_nzones > 0) {
6354 6362                          zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6355 6363                              KM_SLEEP);
6356 6364                          for (zone = list_head(&zone_active); zone != NULL;
6357 6365                              zone = list_next(&zone_active, zone))
6358 6366                                  zoneids[domi_nzones++] = zone->zone_id;
6359 6367                          ASSERT(domi_nzones == real_nzones);
6360 6368                  }
6361 6369                  mutex_exit(&zonehash_lock);
6362 6370          }
6363 6371  
6364 6372          /*
6365 6373           * If user has allocated space for fewer entries than we found, then
6366 6374           * return only up to their limit.  Either way, tell them exactly how
6367 6375           * many we found.
6368 6376           */
6369 6377          if (domi_nzones < user_nzones)
6370 6378                  user_nzones = domi_nzones;
6371 6379          error = 0;
6372 6380          if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6373 6381                  error = EFAULT;
6374 6382          } else if (zoneidlist != NULL && user_nzones != 0) {
6375 6383                  if (copyout(zoneids, zoneidlist,
6376 6384                      user_nzones * sizeof (zoneid_t)) != 0)
6377 6385                          error = EFAULT;
6378 6386          }
6379 6387  
6380 6388          if (real_nzones > 0)
6381 6389                  kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6382 6390  
6383 6391          if (error != 0)
6384 6392                  return (set_errno(error));
6385 6393          else
6386 6394                  return (0);
6387 6395  }
6388 6396  
6389 6397  /*
6390 6398   * Systemcall entry point for zone_lookup(2).
6391 6399   *
6392 6400   * Non-global zones are only able to see themselves and (on labeled systems)
6393 6401   * the zones they dominate.
6394 6402   */
6395 6403  static zoneid_t
6396 6404  zone_lookup(const char *zone_name)
6397 6405  {
6398 6406          char *kname;
6399 6407          zone_t *zone;
6400 6408          zoneid_t zoneid;
6401 6409          int err;
6402 6410  
6403 6411          if (zone_name == NULL) {
6404 6412                  /* return caller's zone id */
6405 6413                  return (getzoneid());
6406 6414          }
6407 6415  
6408 6416          kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6409 6417          if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6410 6418                  kmem_free(kname, ZONENAME_MAX);
6411 6419                  return (set_errno(err));
6412 6420          }
6413 6421  
6414 6422          mutex_enter(&zonehash_lock);
6415 6423          zone = zone_find_all_by_name(kname);
6416 6424          kmem_free(kname, ZONENAME_MAX);
6417 6425          /*
6418 6426           * In a non-global zone, can only lookup global and own name.
6419 6427           * In Trusted Extensions zone label dominance rules apply.
6420 6428           */
6421 6429          if (zone == NULL ||
6422 6430              zone_status_get(zone) < ZONE_IS_READY ||
6423 6431              !zone_list_access(zone)) {
6424 6432                  mutex_exit(&zonehash_lock);
6425 6433                  return (set_errno(EINVAL));
6426 6434          } else {
6427 6435                  zoneid = zone->zone_id;
6428 6436                  mutex_exit(&zonehash_lock);
6429 6437                  return (zoneid);
6430 6438          }
6431 6439  }
6432 6440  
6433 6441  static int
6434 6442  zone_version(int *version_arg)
6435 6443  {
6436 6444          int version = ZONE_SYSCALL_API_VERSION;
6437 6445  
6438 6446          if (copyout(&version, version_arg, sizeof (int)) != 0)
6439 6447                  return (set_errno(EFAULT));
6440 6448          return (0);
6441 6449  }
6442 6450  
6443 6451  /* ARGSUSED */
6444 6452  long
6445 6453  zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6446 6454  {
6447 6455          zone_def zs;
6448 6456          int err;
6449 6457  
6450 6458          switch (cmd) {
6451 6459          case ZONE_CREATE:
6452 6460                  if (get_udatamodel() == DATAMODEL_NATIVE) {
6453 6461                          if (copyin(arg1, &zs, sizeof (zone_def))) {
6454 6462                                  return (set_errno(EFAULT));
6455 6463                          }
6456 6464                  } else {
6457 6465  #ifdef _SYSCALL32_IMPL
6458 6466                          zone_def32 zs32;
6459 6467  
6460 6468                          if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6461 6469                                  return (set_errno(EFAULT));
6462 6470                          }
6463 6471                          zs.zone_name =
6464 6472                              (const char *)(unsigned long)zs32.zone_name;
6465 6473                          zs.zone_root =
6466 6474                              (const char *)(unsigned long)zs32.zone_root;
6467 6475                          zs.zone_privs =
6468 6476                              (const struct priv_set *)
6469 6477                              (unsigned long)zs32.zone_privs;
6470 6478                          zs.zone_privssz = zs32.zone_privssz;
6471 6479                          zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6472 6480                          zs.rctlbufsz = zs32.rctlbufsz;
6473 6481                          zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6474 6482                          zs.zfsbufsz = zs32.zfsbufsz;
6475 6483                          zs.extended_error =
6476 6484                              (int *)(unsigned long)zs32.extended_error;
6477 6485                          zs.match = zs32.match;
6478 6486                          zs.doi = zs32.doi;
6479 6487                          zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6480 6488                          zs.flags = zs32.flags;
6481 6489  #else
6482 6490                          panic("get_udatamodel() returned bogus result\n");
6483 6491  #endif
6484 6492                  }
6485 6493  
6486 6494                  return (zone_create(zs.zone_name, zs.zone_root,
6487 6495                      zs.zone_privs, zs.zone_privssz,
6488 6496                      (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6489 6497                      (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6490 6498                      zs.extended_error, zs.match, zs.doi,
6491 6499                      zs.label, zs.flags));
6492 6500          case ZONE_BOOT:
6493 6501                  return (zone_boot((zoneid_t)(uintptr_t)arg1));
6494 6502          case ZONE_DESTROY:
6495 6503                  return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6496 6504          case ZONE_GETATTR:
6497 6505                  return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6498 6506                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6499 6507          case ZONE_SETATTR:
6500 6508                  return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6501 6509                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6502 6510          case ZONE_ENTER:
6503 6511                  return (zone_enter((zoneid_t)(uintptr_t)arg1));
6504 6512          case ZONE_LIST:
6505 6513                  return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6506 6514          case ZONE_SHUTDOWN:
6507 6515                  return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6508 6516          case ZONE_LOOKUP:
6509 6517                  return (zone_lookup((const char *)arg1));
6510 6518          case ZONE_VERSION:
6511 6519                  return (zone_version((int *)arg1));
6512 6520          case ZONE_ADD_DATALINK:
6513 6521                  return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6514 6522                      (datalink_id_t)(uintptr_t)arg2));
6515 6523          case ZONE_DEL_DATALINK:
6516 6524                  return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6517 6525                      (datalink_id_t)(uintptr_t)arg2));
6518 6526          case ZONE_CHECK_DATALINK: {
6519 6527                  zoneid_t        zoneid;
6520 6528                  boolean_t       need_copyout;
6521 6529  
6522 6530                  if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6523 6531                          return (EFAULT);
6524 6532                  need_copyout = (zoneid == ALL_ZONES);
6525 6533                  err = zone_check_datalink(&zoneid,
6526 6534                      (datalink_id_t)(uintptr_t)arg2);
6527 6535                  if (err == 0 && need_copyout) {
6528 6536                          if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6529 6537                                  err = EFAULT;
6530 6538                  }
6531 6539                  return (err == 0 ? 0 : set_errno(err));
6532 6540          }
6533 6541          case ZONE_LIST_DATALINK:
6534 6542                  return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6535 6543                      (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6536 6544          default:
6537 6545                  return (set_errno(EINVAL));
6538 6546          }
6539 6547  }
6540 6548  
6541 6549  struct zarg {
6542 6550          zone_t *zone;
6543 6551          zone_cmd_arg_t arg;
6544 6552  };
6545 6553  
6546 6554  static int
6547 6555  zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6548 6556  {
6549 6557          char *buf;
6550 6558          size_t buflen;
6551 6559          int error;
6552 6560  
6553 6561          buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6554 6562          buf = kmem_alloc(buflen, KM_SLEEP);
6555 6563          (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6556 6564          error = door_ki_open(buf, doorp);
6557 6565          kmem_free(buf, buflen);
6558 6566          return (error);
6559 6567  }
6560 6568  
6561 6569  static void
6562 6570  zone_release_door(door_handle_t *doorp)
6563 6571  {
6564 6572          door_ki_rele(*doorp);
6565 6573          *doorp = NULL;
6566 6574  }
6567 6575  
6568 6576  static void
6569 6577  zone_ki_call_zoneadmd(struct zarg *zargp)
6570 6578  {
6571 6579          door_handle_t door = NULL;
6572 6580          door_arg_t darg, save_arg;
6573 6581          char *zone_name;
6574 6582          size_t zone_namelen;
6575 6583          zoneid_t zoneid;
6576 6584          zone_t *zone;
6577 6585          zone_cmd_arg_t arg;
6578 6586          uint64_t uniqid;
6579 6587          size_t size;
6580 6588          int error;
6581 6589          int retry;
6582 6590  
6583 6591          zone = zargp->zone;
6584 6592          arg = zargp->arg;
6585 6593          kmem_free(zargp, sizeof (*zargp));
6586 6594  
6587 6595          zone_namelen = strlen(zone->zone_name) + 1;
6588 6596          zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6589 6597          bcopy(zone->zone_name, zone_name, zone_namelen);
6590 6598          zoneid = zone->zone_id;
6591 6599          uniqid = zone->zone_uniqid;
6592 6600          /*
6593 6601           * zoneadmd may be down, but at least we can empty out the zone.
6594 6602           * We can ignore the return value of zone_empty() since we're called
6595 6603           * from a kernel thread and know we won't be delivered any signals.
6596 6604           */
6597 6605          ASSERT(curproc == &p0);
6598 6606          (void) zone_empty(zone);
6599 6607          ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6600 6608          zone_rele(zone);
6601 6609  
6602 6610          size = sizeof (arg);
6603 6611          darg.rbuf = (char *)&arg;
6604 6612          darg.data_ptr = (char *)&arg;
6605 6613          darg.rsize = size;
6606 6614          darg.data_size = size;
6607 6615          darg.desc_ptr = NULL;
6608 6616          darg.desc_num = 0;
6609 6617  
6610 6618          save_arg = darg;
6611 6619          /*
6612 6620           * Since we're not holding a reference to the zone, any number of
6613 6621           * things can go wrong, including the zone disappearing before we get a
6614 6622           * chance to talk to zoneadmd.
6615 6623           */
6616 6624          for (retry = 0; /* forever */; retry++) {
6617 6625                  if (door == NULL &&
6618 6626                      (error = zone_lookup_door(zone_name, &door)) != 0) {
6619 6627                          goto next;
6620 6628                  }
6621 6629                  ASSERT(door != NULL);
6622 6630  
6623 6631                  if ((error = door_ki_upcall_limited(door, &darg, NULL,
6624 6632                      SIZE_MAX, 0)) == 0) {
6625 6633                          break;
6626 6634                  }
6627 6635                  switch (error) {
6628 6636                  case EINTR:
6629 6637                          /* FALLTHROUGH */
6630 6638                  case EAGAIN:    /* process may be forking */
6631 6639                          /*
6632 6640                           * Back off for a bit
6633 6641                           */
6634 6642                          break;
6635 6643                  case EBADF:
6636 6644                          zone_release_door(&door);
6637 6645                          if (zone_lookup_door(zone_name, &door) != 0) {
6638 6646                                  /*
6639 6647                                   * zoneadmd may be dead, but it may come back to
6640 6648                                   * life later.
6641 6649                                   */
6642 6650                                  break;
6643 6651                          }
6644 6652                          break;
6645 6653                  default:
6646 6654                          cmn_err(CE_WARN,
6647 6655                              "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6648 6656                              error);
6649 6657                          goto out;
6650 6658                  }
6651 6659  next:
6652 6660                  /*
6653 6661                   * If this isn't the same zone_t that we originally had in mind,
6654 6662                   * then this is the same as if two kadmin requests come in at
6655 6663                   * the same time: the first one wins.  This means we lose, so we
6656 6664                   * bail.
6657 6665                   */
6658 6666                  if ((zone = zone_find_by_id(zoneid)) == NULL) {
6659 6667                          /*
6660 6668                           * Problem is solved.
6661 6669                           */
6662 6670                          break;
6663 6671                  }
6664 6672                  if (zone->zone_uniqid != uniqid) {
6665 6673                          /*
6666 6674                           * zoneid recycled
6667 6675                           */
6668 6676                          zone_rele(zone);
6669 6677                          break;
6670 6678                  }
6671 6679                  /*
6672 6680                   * We could zone_status_timedwait(), but there doesn't seem to
6673 6681                   * be much point in doing that (plus, it would mean that
6674 6682                   * zone_free() isn't called until this thread exits).
6675 6683                   */
6676 6684                  zone_rele(zone);
6677 6685                  delay(hz);
6678 6686                  darg = save_arg;
6679 6687          }
6680 6688  out:
6681 6689          if (door != NULL) {
6682 6690                  zone_release_door(&door);
6683 6691          }
6684 6692          kmem_free(zone_name, zone_namelen);
6685 6693          thread_exit();
6686 6694  }
6687 6695  
6688 6696  /*
6689 6697   * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6690 6698   * kadmin().  The caller is a process in the zone.
6691 6699   *
6692 6700   * In order to shutdown the zone, we will hand off control to zoneadmd
6693 6701   * (running in the global zone) via a door.  We do a half-hearted job at
6694 6702   * killing all processes in the zone, create a kernel thread to contact
6695 6703   * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6696 6704   * a form of generation number used to let zoneadmd (as well as
6697 6705   * zone_destroy()) know exactly which zone they're re talking about.
6698 6706   */
6699 6707  int
6700 6708  zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6701 6709  {
6702 6710          struct zarg *zargp;
6703 6711          zone_cmd_t zcmd;
6704 6712          zone_t *zone;
6705 6713  
6706 6714          zone = curproc->p_zone;
6707 6715          ASSERT(getzoneid() != GLOBAL_ZONEID);
6708 6716  
6709 6717          switch (cmd) {
6710 6718          case A_SHUTDOWN:
6711 6719                  switch (fcn) {
6712 6720                  case AD_HALT:
6713 6721                  case AD_POWEROFF:
6714 6722                          zcmd = Z_HALT;
6715 6723                          break;
6716 6724                  case AD_BOOT:
6717 6725                          zcmd = Z_REBOOT;
6718 6726                          break;
6719 6727                  case AD_IBOOT:
6720 6728                  case AD_SBOOT:
6721 6729                  case AD_SIBOOT:
6722 6730                  case AD_NOSYNC:
6723 6731                          return (ENOTSUP);
6724 6732                  default:
6725 6733                          return (EINVAL);
6726 6734                  }
6727 6735                  break;
6728 6736          case A_REBOOT:
6729 6737                  zcmd = Z_REBOOT;
6730 6738                  break;
6731 6739          case A_FTRACE:
6732 6740          case A_REMOUNT:
6733 6741          case A_FREEZE:
6734 6742          case A_DUMP:
6735 6743          case A_CONFIG:
6736 6744                  return (ENOTSUP);
6737 6745          default:
6738 6746                  ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6739 6747                  return (EINVAL);
6740 6748          }
6741 6749  
6742 6750          if (secpolicy_zone_admin(credp, B_FALSE))
6743 6751                  return (EPERM);
6744 6752          mutex_enter(&zone_status_lock);
6745 6753  
6746 6754          /*
6747 6755           * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6748 6756           * is in the zone.
6749 6757           */
6750 6758          ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6751 6759          if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6752 6760                  /*
6753 6761                   * This zone is already on its way down.
6754 6762                   */
6755 6763                  mutex_exit(&zone_status_lock);
6756 6764                  return (0);
6757 6765          }
6758 6766          /*
6759 6767           * Prevent future zone_enter()s
6760 6768           */
6761 6769          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6762 6770          mutex_exit(&zone_status_lock);
6763 6771  
6764 6772          /*
6765 6773           * Kill everyone now and call zoneadmd later.
6766 6774           * zone_ki_call_zoneadmd() will do a more thorough job of this
6767 6775           * later.
6768 6776           */
6769 6777          killall(zone->zone_id);
6770 6778          /*
6771 6779           * Now, create the thread to contact zoneadmd and do the rest of the
6772 6780           * work.  This thread can't be created in our zone otherwise
6773 6781           * zone_destroy() would deadlock.
6774 6782           */
6775 6783          zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6776 6784          zargp->arg.cmd = zcmd;
6777 6785          zargp->arg.uniqid = zone->zone_uniqid;
6778 6786          zargp->zone = zone;
6779 6787          (void) strcpy(zargp->arg.locale, "C");
6780 6788          /* mdep was already copied in for us by uadmin */
6781 6789          if (mdep != NULL)
6782 6790                  (void) strlcpy(zargp->arg.bootbuf, mdep,
6783 6791                      sizeof (zargp->arg.bootbuf));
6784 6792          zone_hold(zone);
6785 6793  
6786 6794          (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6787 6795              TS_RUN, minclsyspri);
6788 6796          exit(CLD_EXITED, 0);
6789 6797  
6790 6798          return (EINVAL);
6791 6799  }
6792 6800  
6793 6801  /*
6794 6802   * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6795 6803   * status to ZONE_IS_SHUTTING_DOWN.
6796 6804   *
6797 6805   * This function also shuts down all running zones to ensure that they won't
6798 6806   * fork new processes.
6799 6807   */
6800 6808  void
6801 6809  zone_shutdown_global(void)
6802 6810  {
6803 6811          zone_t *current_zonep;
6804 6812  
6805 6813          ASSERT(INGLOBALZONE(curproc));
6806 6814          mutex_enter(&zonehash_lock);
6807 6815          mutex_enter(&zone_status_lock);
6808 6816  
6809 6817          /* Modify the global zone's status first. */
6810 6818          ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6811 6819          zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6812 6820  
6813 6821          /*
6814 6822           * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6815 6823           * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6816 6824           * could cause assertions to fail (e.g., assertions about a zone's
6817 6825           * state during initialization, readying, or booting) or produce races.
6818 6826           * We'll let threads continue to initialize and ready new zones: they'll
6819 6827           * fail to boot the new zones when they see that the global zone is
6820 6828           * shutting down.
6821 6829           */
6822 6830          for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6823 6831              current_zonep = list_next(&zone_active, current_zonep)) {
6824 6832                  if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6825 6833                          zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6826 6834          }
6827 6835          mutex_exit(&zone_status_lock);
6828 6836          mutex_exit(&zonehash_lock);
6829 6837  }
6830 6838  
6831 6839  /*
6832 6840   * Returns true if the named dataset is visible in the current zone.
6833 6841   * The 'write' parameter is set to 1 if the dataset is also writable.
6834 6842   */
6835 6843  int
6836 6844  zone_dataset_visible(const char *dataset, int *write)
6837 6845  {
6838 6846          static int zfstype = -1;
6839 6847          zone_dataset_t *zd;
6840 6848          size_t len;
6841 6849          zone_t *zone = curproc->p_zone;
6842 6850          const char *name = NULL;
6843 6851          vfs_t *vfsp = NULL;
6844 6852  
6845 6853          if (dataset[0] == '\0')
6846 6854                  return (0);
6847 6855  
6848 6856          /*
6849 6857           * Walk the list once, looking for datasets which match exactly, or
6850 6858           * specify a dataset underneath an exported dataset.  If found, return
6851 6859           * true and note that it is writable.
6852 6860           */
6853 6861          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6854 6862              zd = list_next(&zone->zone_datasets, zd)) {
6855 6863  
6856 6864                  len = strlen(zd->zd_dataset);
6857 6865                  if (strlen(dataset) >= len &&
6858 6866                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6859 6867                      (dataset[len] == '\0' || dataset[len] == '/' ||
6860 6868                      dataset[len] == '@')) {
6861 6869                          if (write)
6862 6870                                  *write = 1;
6863 6871                          return (1);
6864 6872                  }
6865 6873          }
6866 6874  
6867 6875          /*
6868 6876           * Walk the list a second time, searching for datasets which are parents
6869 6877           * of exported datasets.  These should be visible, but read-only.
6870 6878           *
6871 6879           * Note that we also have to support forms such as 'pool/dataset/', with
6872 6880           * a trailing slash.
6873 6881           */
6874 6882          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6875 6883              zd = list_next(&zone->zone_datasets, zd)) {
6876 6884  
6877 6885                  len = strlen(dataset);
6878 6886                  if (dataset[len - 1] == '/')
6879 6887                          len--;  /* Ignore trailing slash */
6880 6888                  if (len < strlen(zd->zd_dataset) &&
6881 6889                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6882 6890                      zd->zd_dataset[len] == '/') {
6883 6891                          if (write)
6884 6892                                  *write = 0;
6885 6893                          return (1);
6886 6894                  }
6887 6895          }
6888 6896  
6889 6897          /*
6890 6898           * We reach here if the given dataset is not found in the zone_dataset
6891 6899           * list. Check if this dataset was added as a filesystem (ie. "add fs")
6892 6900           * instead of delegation. For this we search for the dataset in the
6893 6901           * zone_vfslist of this zone. If found, return true and note that it is
6894 6902           * not writable.
6895 6903           */
6896 6904  
6897 6905          /*
6898 6906           * Initialize zfstype if it is not initialized yet.
6899 6907           */
6900 6908          if (zfstype == -1) {
6901 6909                  struct vfssw *vswp = vfs_getvfssw("zfs");
6902 6910                  zfstype = vswp - vfssw;
6903 6911                  vfs_unrefvfssw(vswp);
6904 6912          }
6905 6913  
6906 6914          vfs_list_read_lock();
6907 6915          vfsp = zone->zone_vfslist;
6908 6916          do {
6909 6917                  ASSERT(vfsp);
6910 6918                  if (vfsp->vfs_fstype == zfstype) {
6911 6919                          name = refstr_value(vfsp->vfs_resource);
6912 6920  
6913 6921                          /*
6914 6922                           * Check if we have an exact match.
6915 6923                           */
6916 6924                          if (strcmp(dataset, name) == 0) {
6917 6925                                  vfs_list_unlock();
6918 6926                                  if (write)
6919 6927                                          *write = 0;
6920 6928                                  return (1);
6921 6929                          }
6922 6930                          /*
6923 6931                           * We need to check if we are looking for parents of
6924 6932                           * a dataset. These should be visible, but read-only.
6925 6933                           */
6926 6934                          len = strlen(dataset);
6927 6935                          if (dataset[len - 1] == '/')
6928 6936                                  len--;
6929 6937  
6930 6938                          if (len < strlen(name) &&
6931 6939                              bcmp(dataset, name, len) == 0 && name[len] == '/') {
6932 6940                                  vfs_list_unlock();
6933 6941                                  if (write)
6934 6942                                          *write = 0;
6935 6943                                  return (1);
6936 6944                          }
6937 6945                  }
6938 6946                  vfsp = vfsp->vfs_zone_next;
6939 6947          } while (vfsp != zone->zone_vfslist);
6940 6948  
6941 6949          vfs_list_unlock();
6942 6950          return (0);
6943 6951  }
6944 6952  
6945 6953  /*
6946 6954   * zone_find_by_any_path() -
6947 6955   *
6948 6956   * kernel-private routine similar to zone_find_by_path(), but which
6949 6957   * effectively compares against zone paths rather than zonerootpath
6950 6958   * (i.e., the last component of zonerootpaths, which should be "root/",
6951 6959   * are not compared.)  This is done in order to accurately identify all
6952 6960   * paths, whether zone-visible or not, including those which are parallel
6953 6961   * to /root/, such as /dev/, /home/, etc...
6954 6962   *
6955 6963   * If the specified path does not fall under any zone path then global
6956 6964   * zone is returned.
6957 6965   *
6958 6966   * The treat_abs parameter indicates whether the path should be treated as
6959 6967   * an absolute path although it does not begin with "/".  (This supports
6960 6968   * nfs mount syntax such as host:any/path.)
6961 6969   *
6962 6970   * The caller is responsible for zone_rele of the returned zone.
6963 6971   */
6964 6972  zone_t *
6965 6973  zone_find_by_any_path(const char *path, boolean_t treat_abs)
6966 6974  {
6967 6975          zone_t *zone;
6968 6976          int path_offset = 0;
6969 6977  
6970 6978          if (path == NULL) {
6971 6979                  zone_hold(global_zone);
6972 6980                  return (global_zone);
6973 6981          }
6974 6982  
6975 6983          if (*path != '/') {
6976 6984                  ASSERT(treat_abs);
6977 6985                  path_offset = 1;
6978 6986          }
6979 6987  
6980 6988          mutex_enter(&zonehash_lock);
6981 6989          for (zone = list_head(&zone_active); zone != NULL;
6982 6990              zone = list_next(&zone_active, zone)) {
6983 6991                  char    *c;
6984 6992                  size_t  pathlen;
6985 6993                  char *rootpath_start;
6986 6994  
6987 6995                  if (zone == global_zone)        /* skip global zone */
6988 6996                          continue;
6989 6997  
6990 6998                  /* scan backwards to find start of last component */
6991 6999                  c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6992 7000                  do {
6993 7001                          c--;
6994 7002                  } while (*c != '/');
6995 7003  
6996 7004                  pathlen = c - zone->zone_rootpath + 1 - path_offset;
6997 7005                  rootpath_start = (zone->zone_rootpath + path_offset);
6998 7006                  if (strncmp(path, rootpath_start, pathlen) == 0)
6999 7007                          break;
7000 7008          }
7001 7009          if (zone == NULL)
7002 7010                  zone = global_zone;
7003 7011          zone_hold(zone);
7004 7012          mutex_exit(&zonehash_lock);
7005 7013          return (zone);
7006 7014  }
7007 7015  
7008 7016  /*
7009 7017   * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7010 7018   * zone_dl_t pointer if found, and NULL otherwise.
7011 7019   */
7012 7020  static zone_dl_t *
7013 7021  zone_find_dl(zone_t *zone, datalink_id_t linkid)
7014 7022  {
7015 7023          zone_dl_t *zdl;
7016 7024  
7017 7025          ASSERT(mutex_owned(&zone->zone_lock));
7018 7026          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7019 7027              zdl = list_next(&zone->zone_dl_list, zdl)) {
7020 7028                  if (zdl->zdl_id == linkid)
7021 7029                          break;
7022 7030          }
7023 7031          return (zdl);
7024 7032  }
7025 7033  
7026 7034  static boolean_t
7027 7035  zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7028 7036  {
7029 7037          boolean_t exists;
7030 7038  
7031 7039          mutex_enter(&zone->zone_lock);
7032 7040          exists = (zone_find_dl(zone, linkid) != NULL);
7033 7041          mutex_exit(&zone->zone_lock);
7034 7042          return (exists);
7035 7043  }
7036 7044  
7037 7045  /*
7038 7046   * Add an data link name for the zone.
7039 7047   */
7040 7048  static int
7041 7049  zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7042 7050  {
7043 7051          zone_dl_t *zdl;
7044 7052          zone_t *zone;
7045 7053          zone_t *thiszone;
7046 7054  
7047 7055          if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7048 7056                  return (set_errno(ENXIO));
7049 7057  
7050 7058          /* Verify that the datalink ID doesn't already belong to a zone. */
7051 7059          mutex_enter(&zonehash_lock);
7052 7060          for (zone = list_head(&zone_active); zone != NULL;
7053 7061              zone = list_next(&zone_active, zone)) {
7054 7062                  if (zone_dl_exists(zone, linkid)) {
7055 7063                          mutex_exit(&zonehash_lock);
7056 7064                          zone_rele(thiszone);
7057 7065                          return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7058 7066                  }
7059 7067          }
7060 7068  
7061 7069          zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7062 7070          zdl->zdl_id = linkid;
7063 7071          zdl->zdl_net = NULL;
7064 7072          mutex_enter(&thiszone->zone_lock);
7065 7073          list_insert_head(&thiszone->zone_dl_list, zdl);
7066 7074          mutex_exit(&thiszone->zone_lock);
7067 7075          mutex_exit(&zonehash_lock);
7068 7076          zone_rele(thiszone);
7069 7077          return (0);
7070 7078  }
7071 7079  
7072 7080  static int
7073 7081  zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7074 7082  {
7075 7083          zone_dl_t *zdl;
7076 7084          zone_t *zone;
7077 7085          int err = 0;
7078 7086  
7079 7087          if ((zone = zone_find_by_id(zoneid)) == NULL)
7080 7088                  return (set_errno(EINVAL));
7081 7089  
7082 7090          mutex_enter(&zone->zone_lock);
7083 7091          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7084 7092                  err = ENXIO;
7085 7093          } else {
7086 7094                  list_remove(&zone->zone_dl_list, zdl);
7087 7095                  nvlist_free(zdl->zdl_net);
7088 7096                  kmem_free(zdl, sizeof (zone_dl_t));
7089 7097          }
7090 7098          mutex_exit(&zone->zone_lock);
7091 7099          zone_rele(zone);
7092 7100          return (err == 0 ? 0 : set_errno(err));
7093 7101  }
7094 7102  
7095 7103  /*
7096 7104   * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
7097 7105   * the linkid.  Otherwise we just check if the specified zoneidp has been
7098 7106   * assigned the supplied linkid.
7099 7107   */
7100 7108  int
7101 7109  zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7102 7110  {
7103 7111          zone_t *zone;
7104 7112          int err = ENXIO;
7105 7113  
7106 7114          if (*zoneidp != ALL_ZONES) {
7107 7115                  if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7108 7116                          if (zone_dl_exists(zone, linkid))
7109 7117                                  err = 0;
7110 7118                          zone_rele(zone);
7111 7119                  }
7112 7120                  return (err);
7113 7121          }
7114 7122  
7115 7123          mutex_enter(&zonehash_lock);
7116 7124          for (zone = list_head(&zone_active); zone != NULL;
7117 7125              zone = list_next(&zone_active, zone)) {
7118 7126                  if (zone_dl_exists(zone, linkid)) {
7119 7127                          *zoneidp = zone->zone_id;
7120 7128                          err = 0;
7121 7129                          break;
7122 7130                  }
7123 7131          }
7124 7132          mutex_exit(&zonehash_lock);
7125 7133          return (err);
7126 7134  }
7127 7135  
7128 7136  /*
7129 7137   * Get the list of datalink IDs assigned to a zone.
7130 7138   *
7131 7139   * On input, *nump is the number of datalink IDs that can fit in the supplied
7132 7140   * idarray.  Upon return, *nump is either set to the number of datalink IDs
7133 7141   * that were placed in the array if the array was large enough, or to the
7134 7142   * number of datalink IDs that the function needs to place in the array if the
7135 7143   * array is too small.
7136 7144   */
7137 7145  static int
7138 7146  zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7139 7147  {
7140 7148          uint_t num, dlcount;
7141 7149          zone_t *zone;
7142 7150          zone_dl_t *zdl;
7143 7151          datalink_id_t *idptr = idarray;
7144 7152  
7145 7153          if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7146 7154                  return (set_errno(EFAULT));
7147 7155          if ((zone = zone_find_by_id(zoneid)) == NULL)
7148 7156                  return (set_errno(ENXIO));
7149 7157  
7150 7158          num = 0;
7151 7159          mutex_enter(&zone->zone_lock);
7152 7160          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7153 7161              zdl = list_next(&zone->zone_dl_list, zdl)) {
7154 7162                  /*
7155 7163                   * If the list is bigger than what the caller supplied, just
7156 7164                   * count, don't do copyout.
7157 7165                   */
7158 7166                  if (++num > dlcount)
7159 7167                          continue;
7160 7168                  if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7161 7169                          mutex_exit(&zone->zone_lock);
7162 7170                          zone_rele(zone);
7163 7171                          return (set_errno(EFAULT));
7164 7172                  }
7165 7173                  idptr++;
7166 7174          }
7167 7175          mutex_exit(&zone->zone_lock);
7168 7176          zone_rele(zone);
7169 7177  
7170 7178          /* Increased or decreased, caller should be notified. */
7171 7179          if (num != dlcount) {
7172 7180                  if (copyout(&num, nump, sizeof (num)) != 0)
7173 7181                          return (set_errno(EFAULT));
7174 7182          }
7175 7183          return (0);
7176 7184  }
7177 7185  
7178 7186  /*
7179 7187   * Public interface for looking up a zone by zoneid. It's a customized version
7180 7188   * for netstack_zone_create(). It can only be called from the zsd create
7181 7189   * callbacks, since it doesn't have reference on the zone structure hence if
7182 7190   * it is called elsewhere the zone could disappear after the zonehash_lock
7183 7191   * is dropped.
7184 7192   *
7185 7193   * Furthermore it
7186 7194   * 1. Doesn't check the status of the zone.
7187 7195   * 2. It will be called even before zone_init is called, in that case the
7188 7196   *    address of zone0 is returned directly, and netstack_zone_create()
7189 7197   *    will only assign a value to zone0.zone_netstack, won't break anything.
7190 7198   * 3. Returns without the zone being held.
7191 7199   */
7192 7200  zone_t *
7193 7201  zone_find_by_id_nolock(zoneid_t zoneid)
7194 7202  {
7195 7203          zone_t *zone;
7196 7204  
7197 7205          mutex_enter(&zonehash_lock);
7198 7206          if (zonehashbyid == NULL)
7199 7207                  zone = &zone0;
7200 7208          else
7201 7209                  zone = zone_find_all_by_id(zoneid);
7202 7210          mutex_exit(&zonehash_lock);
7203 7211          return (zone);
7204 7212  }
7205 7213  
7206 7214  /*
7207 7215   * Walk the datalinks for a given zone
7208 7216   */
7209 7217  int
7210 7218  zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7211 7219      void *data)
7212 7220  {
7213 7221          zone_t          *zone;
7214 7222          zone_dl_t       *zdl;
7215 7223          datalink_id_t   *idarray;
7216 7224          uint_t          idcount = 0;
7217 7225          int             i, ret = 0;
7218 7226  
7219 7227          if ((zone = zone_find_by_id(zoneid)) == NULL)
7220 7228                  return (ENOENT);
7221 7229  
7222 7230          /*
7223 7231           * We first build an array of linkid's so that we can walk these and
7224 7232           * execute the callback with the zone_lock dropped.
7225 7233           */
7226 7234          mutex_enter(&zone->zone_lock);
7227 7235          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7228 7236              zdl = list_next(&zone->zone_dl_list, zdl)) {
7229 7237                  idcount++;
7230 7238          }
7231 7239  
7232 7240          if (idcount == 0) {
7233 7241                  mutex_exit(&zone->zone_lock);
7234 7242                  zone_rele(zone);
7235 7243                  return (0);
7236 7244          }
7237 7245  
7238 7246          idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7239 7247          if (idarray == NULL) {
7240 7248                  mutex_exit(&zone->zone_lock);
7241 7249                  zone_rele(zone);
7242 7250                  return (ENOMEM);
7243 7251          }
7244 7252  
7245 7253          for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7246 7254              i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7247 7255                  idarray[i] = zdl->zdl_id;
7248 7256          }
7249 7257  
7250 7258          mutex_exit(&zone->zone_lock);
7251 7259  
7252 7260          for (i = 0; i < idcount && ret == 0; i++) {
7253 7261                  if ((ret = (*cb)(idarray[i], data)) != 0)
7254 7262                          break;
7255 7263          }
7256 7264  
7257 7265          zone_rele(zone);
7258 7266          kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7259 7267          return (ret);
7260 7268  }
7261 7269  
7262 7270  static char *
7263 7271  zone_net_type2name(int type)
7264 7272  {
7265 7273          switch (type) {
7266 7274          case ZONE_NETWORK_ADDRESS:
7267 7275                  return (ZONE_NET_ADDRNAME);
7268 7276          case ZONE_NETWORK_DEFROUTER:
7269 7277                  return (ZONE_NET_RTRNAME);
7270 7278          default:
7271 7279                  return (NULL);
7272 7280          }
7273 7281  }
7274 7282  
7275 7283  static int
7276 7284  zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7277 7285  {
7278 7286          zone_t *zone;
7279 7287          zone_dl_t *zdl;
7280 7288          nvlist_t *nvl;
7281 7289          int err = 0;
7282 7290          uint8_t *new = NULL;
7283 7291          char *nvname;
7284 7292          int bufsize;
7285 7293          datalink_id_t linkid = znbuf->zn_linkid;
7286 7294  
7287 7295          if (secpolicy_zone_config(CRED()) != 0)
7288 7296                  return (set_errno(EPERM));
7289 7297  
7290 7298          if (zoneid == GLOBAL_ZONEID)
7291 7299                  return (set_errno(EINVAL));
7292 7300  
7293 7301          nvname = zone_net_type2name(znbuf->zn_type);
7294 7302          bufsize = znbuf->zn_len;
7295 7303          new = znbuf->zn_val;
7296 7304          if (nvname == NULL)
7297 7305                  return (set_errno(EINVAL));
7298 7306  
7299 7307          if ((zone = zone_find_by_id(zoneid)) == NULL) {
7300 7308                  return (set_errno(EINVAL));
7301 7309          }
7302 7310  
7303 7311          mutex_enter(&zone->zone_lock);
7304 7312          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7305 7313                  err = ENXIO;
7306 7314                  goto done;
7307 7315          }
7308 7316          if ((nvl = zdl->zdl_net) == NULL) {
7309 7317                  if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7310 7318                          err = ENOMEM;
7311 7319                          goto done;
7312 7320                  } else {
7313 7321                          zdl->zdl_net = nvl;
7314 7322                  }
7315 7323          }
7316 7324          if (nvlist_exists(nvl, nvname)) {
7317 7325                  err = EINVAL;
7318 7326                  goto done;
7319 7327          }
7320 7328          err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7321 7329          ASSERT(err == 0);
7322 7330  done:
7323 7331          mutex_exit(&zone->zone_lock);
7324 7332          zone_rele(zone);
7325 7333          if (err != 0)
7326 7334                  return (set_errno(err));
7327 7335          else
7328 7336                  return (0);
7329 7337  }
7330 7338  
7331 7339  static int
7332 7340  zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7333 7341  {
7334 7342          zone_t *zone;
7335 7343          zone_dl_t *zdl;
7336 7344          nvlist_t *nvl;
7337 7345          uint8_t *ptr;
7338 7346          uint_t psize;
7339 7347          int err = 0;
7340 7348          char *nvname;
7341 7349          int bufsize;
7342 7350          void *buf;
7343 7351          datalink_id_t linkid = znbuf->zn_linkid;
7344 7352  
7345 7353          if (zoneid == GLOBAL_ZONEID)
7346 7354                  return (set_errno(EINVAL));
7347 7355  
7348 7356          nvname = zone_net_type2name(znbuf->zn_type);
7349 7357          bufsize = znbuf->zn_len;
7350 7358          buf = znbuf->zn_val;
7351 7359  
7352 7360          if (nvname == NULL)
7353 7361                  return (set_errno(EINVAL));
7354 7362          if ((zone = zone_find_by_id(zoneid)) == NULL)
7355 7363                  return (set_errno(EINVAL));
7356 7364  
7357 7365          mutex_enter(&zone->zone_lock);
7358 7366          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7359 7367                  err = ENXIO;
7360 7368                  goto done;
7361 7369          }
7362 7370          if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7363 7371                  err = ENOENT;
7364 7372                  goto done;
7365 7373          }
7366 7374          err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7367 7375          ASSERT(err == 0);
7368 7376  
7369 7377          if (psize > bufsize) {
7370 7378                  err = ENOBUFS;
7371 7379                  goto done;
7372 7380          }
7373 7381          znbuf->zn_len = psize;
7374 7382          bcopy(ptr, buf, psize);
7375 7383  done:
7376 7384          mutex_exit(&zone->zone_lock);
7377 7385          zone_rele(zone);
7378 7386          if (err != 0)
7379 7387                  return (set_errno(err));
7380 7388          else
7381 7389                  return (0);
7382 7390  }

↓ open down ↓

2826 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX