2 Wdiff usr/src/uts/common/os/zone.c

Print this page

OS-208 DTrace needs to use zone_did to match zone-limited enablings
OS-192 zone_create() warning on headnode

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/zone.c
          +++ new/usr/src/uts/common/os/zone.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2015, Joyent Inc. All rights reserved.
  25   25   * Copyright (c) 2016 by Delphix. All rights reserved.
  26   26   * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
  27   27   */
  28   28  
  29   29  /*
  30   30   * Zones
  31   31   *
  32   32   *   A zone is a named collection of processes, namespace constraints,
  33   33   *   and other system resources which comprise a secure and manageable
  34   34   *   application containment facility.
  35   35   *
  36   36   *   Zones (represented by the reference counted zone_t) are tracked in
  37   37   *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  38   38   *   (zoneid_t) are used to track zone association.  Zone IDs are
  39   39   *   dynamically generated when the zone is created; if a persistent
  40   40   *   identifier is needed (core files, accounting logs, audit trail,
  41   41   *   etc.), the zone name should be used.
  42   42   *
  43   43   *
  44   44   *   Global Zone:
  45   45   *
  46   46   *   The global zone (zoneid 0) is automatically associated with all
  47   47   *   system resources that have not been bound to a user-created zone.
  48   48   *   This means that even systems where zones are not in active use
  49   49   *   have a global zone, and all processes, mounts, etc. are
  50   50   *   associated with that zone.  The global zone is generally
  51   51   *   unconstrained in terms of privileges and access, though the usual
  52   52   *   credential and privilege based restrictions apply.
  53   53   *
  54   54   *
  55   55   *   Zone States:
  56   56   *
   57   57   *   The states a zone may be in, and the transitions between them, are
   58   58   *   as follows:
  59   59   *
  60   60   *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  61   61   *   initialized zone is added to the list of active zones on the system but
  62   62   *   isn't accessible.
  63   63   *
  64   64   *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  65   65   *   not yet completed. Not possible to enter the zone, but attributes can
  66   66   *   be retrieved.
  67   67   *
  68   68   *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  69   69   *   ready.  The zone is made visible after the ZSD constructor callbacks are
  70   70   *   executed.  A zone remains in this state until it transitions into
  71   71   *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  72   72   *
   73   73   *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  74   74   *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  75   75   *   state.
  76   76   *
  77   77   *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  78   78   *   successfully started init.   A zone remains in this state until
  79   79   *   zone_shutdown() is called.
  80   80   *
  81   81   *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  82   82   *   killing all processes running in the zone. The zone remains
  83   83   *   in this state until there are no more user processes running in the zone.
  84   84   *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  85   85   *   Since zone_shutdown() is restartable, it may be called successfully
  86   86   *   multiple times for the same zone_t.  Setting of the zone's state to
  87   87   *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  88   88   *   the zone's status without worrying about it being a moving target.
  89   89   *
  90   90   *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  91   91   *   are no more user processes in the zone.  The zone remains in this
  92   92   *   state until there are no more kernel threads associated with the
  93   93   *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  94   94   *   fail.
  95   95   *
  96   96   *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  97   97   *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  98   98   *   join the zone or create kernel threads therein.
  99   99   *
 100  100   *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 101  101   *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 102  102   *   return NULL from now on.
 103  103   *
 104  104   *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 105  105   *   processes or threads doing work on behalf of the zone.  The zone is
 106  106   *   removed from the list of active zones.  zone_destroy() returns, and
 107  107   *   the zone can be recreated.
 108  108   *
 109  109   *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 110  110   *   callbacks are executed, and all memory associated with the zone is
 111  111   *   freed.
 112  112   *
 113  113   *   Threads can wait for the zone to enter a requested state by using
 114  114   *   zone_status_wait() or zone_status_timedwait() with the desired
 115  115   *   state passed in as an argument.  Zone state transitions are
 116  116   *   uni-directional; it is not possible to move back to an earlier state.
 117  117   *
 118  118   *
 119  119   *   Zone-Specific Data:
 120  120   *
 121  121   *   Subsystems needing to maintain zone-specific data can store that
 122  122   *   data using the ZSD mechanism.  This provides a zone-specific data
 123  123   *   store, similar to thread-specific data (see pthread_getspecific(3C)
  124  124   *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 125  125   *   to register callbacks to be invoked when a zone is created, shut
 126  126   *   down, or destroyed.  This can be used to initialize zone-specific
 127  127   *   data for new zones and to clean up when zones go away.
 128  128   *
 129  129   *
 130  130   *   Data Structures:
 131  131   *
 132  132   *   The per-zone structure (zone_t) is reference counted, and freed
 133  133   *   when all references are released.  zone_hold and zone_rele can be
 134  134   *   used to adjust the reference count.  In addition, reference counts
 135  135   *   associated with the cred_t structure are tracked separately using
 136  136   *   zone_cred_hold and zone_cred_rele.
 137  137   *
 138  138   *   Pointers to active zone_t's are stored in two hash tables; one
 139  139   *   for searching by id, the other for searching by name.  Lookups
 140  140   *   can be performed on either basis, using zone_find_by_id and
 141  141   *   zone_find_by_name.  Both return zone_t pointers with the zone
 142  142   *   held, so zone_rele should be called when the pointer is no longer
 143  143   *   needed.  Zones can also be searched by path; zone_find_by_path
 144  144   *   returns the zone with which a path name is associated (global
 145  145   *   zone if the path is not within some other zone's file system
 146  146   *   hierarchy).  This currently requires iterating through each zone,
 147  147   *   so it is slower than an id or name search via a hash table.
 148  148   *
 149  149   *
 150  150   *   Locking:
 151  151   *
 152  152   *   zonehash_lock: This is a top-level global lock used to protect the
 153  153   *       zone hash tables and lists.  Zones cannot be created or destroyed
 154  154   *       while this lock is held.
 155  155   *   zone_status_lock: This is a global lock protecting zone state.
 156  156   *       Zones cannot change state while this lock is held.  It also
 157  157   *       protects the list of kernel threads associated with a zone.
 158  158   *   zone_lock: This is a per-zone lock used to protect several fields of
 159  159   *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 160  160   *       this lock means that the zone cannot go away.
 161  161   *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 162  162   *       related to the zone.max-lwps rctl.
 163  163   *   zone_mem_lock: This is a per-zone lock used to protect the fields
 164  164   *       related to the zone.max-locked-memory and zone.max-swap rctls.
 165  165   *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 166  166   *       currently just max_lofi
 167  167   *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 168  168   *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 169  169   *       list (a list of zones in the ZONE_IS_DEAD state).
 170  170   *
 171  171   *   Ordering requirements:
 172  172   *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 173  173   *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 174  174   *
 175  175   *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 176  176   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 177  177   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 178  178   *
 179  179   *   Blocking memory allocations are permitted while holding any of the
 180  180   *   zone locks.
 181  181   *
 182  182   *
 183  183   *   System Call Interface:
 184  184   *
 185  185   *   The zone subsystem can be managed and queried from user level with
 186  186   *   the following system calls (all subcodes of the primary "zone"
 187  187   *   system call):
 188  188   *   - zone_create: creates a zone with selected attributes (name,
 189  189   *     root path, privileges, resource controls, ZFS datasets)
 190  190   *   - zone_enter: allows the current process to enter a zone
 191  191   *   - zone_getattr: reports attributes of a zone
 192  192   *   - zone_setattr: set attributes of a zone
 193  193   *   - zone_boot: set 'init' running for the zone
 194  194   *   - zone_list: lists all zones active in the system
 195  195   *   - zone_lookup: looks up zone id based on name
 196  196   *   - zone_shutdown: initiates shutdown process (see states above)
 197  197   *   - zone_destroy: completes shutdown process (see states above)
 198  198   *
 199  199   */
 200  200  
 201  201  #include <sys/priv_impl.h>
 202  202  #include <sys/cred.h>
 203  203  #include <c2/audit.h>
 204  204  #include <sys/debug.h>
 205  205  #include <sys/file.h>
 206  206  #include <sys/kmem.h>
 207  207  #include <sys/kstat.h>
 208  208  #include <sys/mutex.h>
 209  209  #include <sys/note.h>
 210  210  #include <sys/pathname.h>
 211  211  #include <sys/proc.h>
 212  212  #include <sys/project.h>
 213  213  #include <sys/sysevent.h>
 214  214  #include <sys/task.h>
 215  215  #include <sys/systm.h>
 216  216  #include <sys/types.h>
 217  217  #include <sys/utsname.h>
 218  218  #include <sys/vnode.h>
 219  219  #include <sys/vfs.h>
 220  220  #include <sys/systeminfo.h>
 221  221  #include <sys/policy.h>
 222  222  #include <sys/cred_impl.h>
 223  223  #include <sys/contract_impl.h>
 224  224  #include <sys/contract/process_impl.h>
 225  225  #include <sys/class.h>
 226  226  #include <sys/pool.h>
 227  227  #include <sys/pool_pset.h>
 228  228  #include <sys/pset.h>
 229  229  #include <sys/strlog.h>
 230  230  #include <sys/sysmacros.h>
 231  231  #include <sys/callb.h>
 232  232  #include <sys/vmparam.h>
 233  233  #include <sys/corectl.h>
 234  234  #include <sys/ipc_impl.h>
 235  235  #include <sys/klpd.h>
 236  236  
 237  237  #include <sys/door.h>
 238  238  #include <sys/cpuvar.h>
 239  239  #include <sys/sdt.h>
 240  240  
 241  241  #include <sys/uadmin.h>
 242  242  #include <sys/session.h>
 243  243  #include <sys/cmn_err.h>
 244  244  #include <sys/modhash.h>
 245  245  #include <sys/sunddi.h>
 246  246  #include <sys/nvpair.h>
 247  247  #include <sys/rctl.h>
 248  248  #include <sys/fss.h>
 249  249  #include <sys/brand.h>
 250  250  #include <sys/zone.h>
 251  251  #include <net/if.h>
 252  252  #include <sys/cpucaps.h>
 253  253  #include <vm/seg.h>
 254  254  #include <sys/mac.h>
 255  255  
 256  256  /*
 257  257   * This constant specifies the number of seconds that threads waiting for
 258  258   * subsystems to release a zone's general-purpose references will wait before
 259  259   * they log the zone's reference counts.  The constant's value shouldn't
 260  260   * be so small that reference counts are unnecessarily reported for zones
 261  261   * whose references are slowly released.  On the other hand, it shouldn't be so
 262  262   * large that users reboot their systems out of frustration over hung zones
 263  263   * before the system logs the zones' reference counts.
 264  264   */
 265  265  #define ZONE_DESTROY_TIMEOUT_SECS       60
 266  266  
 267  267  /* List of data link IDs which are accessible from the zone */
 268  268  typedef struct zone_dl {
 269  269          datalink_id_t   zdl_id;
 270  270          nvlist_t        *zdl_net;
 271  271          list_node_t     zdl_linkage;
 272  272  } zone_dl_t;
 273  273  
 274  274  /*
 275  275   * cv used to signal that all references to the zone have been released.  This
 276  276   * needs to be global since there may be multiple waiters, and the first to
 277  277   * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 278  278   */
 279  279  static kcondvar_t zone_destroy_cv;
 280  280  /*
 281  281   * Lock used to serialize access to zone_cv.  This could have been per-zone,
 282  282   * but then we'd need another lock for zone_destroy_cv, and why bother?
 283  283   */
 284  284  static kmutex_t zone_status_lock;
 285  285  
 286  286  /*
 287  287   * ZSD-related global variables.
 288  288   */
 289  289  static kmutex_t zsd_key_lock;   /* protects the following two */
 290  290  /*
 291  291   * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 292  292   */
 293  293  static zone_key_t zsd_keyval = 0;
 294  294  /*
 295  295   * Global list of registered keys.  We use this when a new zone is created.
 296  296   */
 297  297  static list_t zsd_registered_keys;
 298  298  
 299  299  int zone_hash_size = 256;
 300  300  static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 301  301  static kmutex_t zonehash_lock;
 302  302  static uint_t zonecount;
 303  303  static id_space_t *zoneid_space;
 304  304  
 305  305  /*
 306  306   * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 307  307   * kernel proper runs, and which manages all other zones.
 308  308   *
 309  309   * Although not declared as static, the variable "zone0" should not be used
  310  310   * except by code that needs to reference the global zone early on in boot,
 311  311   * before it is fully initialized.  All other consumers should use
 312  312   * 'global_zone'.
 313  313   */
 314  314  zone_t zone0;
 315  315  zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 316  316  
 317  317  /*
 318  318   * List of active zones, protected by zonehash_lock.
 319  319   */
 320  320  static list_t zone_active;
 321  321  
 322  322  /*
 323  323   * List of destroyed zones that still have outstanding cred references.
 324  324   * Used for debugging.  Uses a separate lock to avoid lock ordering
 325  325   * problems in zone_free.
 326  326   */
 327  327  static list_t zone_deathrow;
 328  328  static kmutex_t zone_deathrow_lock;
 329  329  
 330  330  /* number of zones is limited by virtual interface limit in IP */
 331  331  uint_t maxzones = 8192;
 332  332  
  333  333  /* Event channel to send zone state change notifications */
 334  334  evchan_t *zone_event_chan;
 335  335  
 336  336  /*
 337  337   * This table holds the mapping from kernel zone states to
 338  338   * states visible in the state notification API.
 339  339   * The idea is that we only expose "obvious" states and
 340  340   * do not expose states which are just implementation details.
 341  341   */
 342  342  const char  *zone_status_table[] = {
 343  343          ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 344  344          ZONE_EVENT_INITIALIZED,         /* initialized */
 345  345          ZONE_EVENT_READY,               /* ready */
 346  346          ZONE_EVENT_READY,               /* booting */
 347  347          ZONE_EVENT_RUNNING,             /* running */
 348  348          ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 349  349          ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 350  350          ZONE_EVENT_SHUTTING_DOWN,       /* down */
 351  351          ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 352  352          ZONE_EVENT_UNINITIALIZED,       /* dead */
 353  353  };
 354  354  
 355  355  /*
 356  356   * This array contains the names of the subsystems listed in zone_ref_subsys_t
 357  357   * (see sys/zone.h).
 358  358   */
 359  359  static char *zone_ref_subsys_names[] = {
 360  360          "NFS",          /* ZONE_REF_NFS */
 361  361          "NFSv4",        /* ZONE_REF_NFSV4 */
 362  362          "SMBFS",        /* ZONE_REF_SMBFS */
 363  363          "MNTFS",        /* ZONE_REF_MNTFS */
 364  364          "LOFI",         /* ZONE_REF_LOFI */
 365  365          "VFS",          /* ZONE_REF_VFS */
 366  366          "IPC"           /* ZONE_REF_IPC */
 367  367  };
 368  368  
 369  369  /*
 370  370   * This isn't static so lint doesn't complain.
 371  371   */
 372  372  rctl_hndl_t rc_zone_cpu_shares;
 373  373  rctl_hndl_t rc_zone_locked_mem;
 374  374  rctl_hndl_t rc_zone_max_swap;
 375  375  rctl_hndl_t rc_zone_max_lofi;
 376  376  rctl_hndl_t rc_zone_cpu_cap;
 377  377  rctl_hndl_t rc_zone_nlwps;
 378  378  rctl_hndl_t rc_zone_nprocs;
 379  379  rctl_hndl_t rc_zone_shmmax;
 380  380  rctl_hndl_t rc_zone_shmmni;
 381  381  rctl_hndl_t rc_zone_semmni;
 382  382  rctl_hndl_t rc_zone_msgmni;
 383  383  
 384  384  const char * const zone_default_initname = "/sbin/init";
 385  385  static char * const zone_prefix = "/zone/";
 386  386  static int zone_shutdown(zoneid_t zoneid);
 387  387  static int zone_add_datalink(zoneid_t, datalink_id_t);
 388  388  static int zone_remove_datalink(zoneid_t, datalink_id_t);
 389  389  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 390  390  static int zone_set_network(zoneid_t, zone_net_data_t *);
 391  391  static int zone_get_network(zoneid_t, zone_net_data_t *);
 392  392  
 393  393  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 394  394  
 395  395  static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 396  396  static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 397  397  static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 398  398  static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 399  399      zone_key_t);
 400  400  static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 401  401  static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 402  402      kmutex_t *);
 403  403  static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 404  404      kmutex_t *);
 405  405  
 406  406  /*
 407  407   * Bump this number when you alter the zone syscall interfaces; this is
 408  408   * because we need to have support for previous API versions in libc
 409  409   * to support patching; libc calls into the kernel to determine this number.
 410  410   *
 411  411   * Version 1 of the API is the version originally shipped with Solaris 10

↓ open down ↓

411 lines elided

↑ open up ↑

 412  412   * Version 2 alters the zone_create system call in order to support more
 413  413   *     arguments by moving the args into a structure; and to do better
 414  414   *     error reporting when zone_create() fails.
 415  415   * Version 3 alters the zone_create system call in order to support the
 416  416   *     import of ZFS datasets to zones.
 417  417   * Version 4 alters the zone_create system call in order to support
 418  418   *     Trusted Extensions.
 419  419   * Version 5 alters the zone_boot system call, and converts its old
 420  420   *     bootargs parameter to be set by the zone_setattr API instead.
 421  421   * Version 6 adds the flag argument to zone_create.
      422 + * Version 7 adds the requested zone_did to zone_create.
 422  423   */
 423      -static const int ZONE_SYSCALL_API_VERSION = 6;
      424 +static const int ZONE_SYSCALL_API_VERSION = 7;
 424  425  
 425  426  /*
 426  427   * Certain filesystems (such as NFS and autofs) need to know which zone
 427  428   * the mount is being placed in.  Because of this, we need to be able to
 428  429   * ensure that a zone isn't in the process of being created/destroyed such
 429  430   * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
  430  431   * it gets added to the list of mounted zones, it ends up on the wrong zone's
 431  432   * mount list. Since a zone can't reside on an NFS file system, we don't
 432  433   * have to worry about the zonepath itself.
 433  434   *

 434  435   * The following functions: block_mounts()/resume_mounts() and
 435  436   * mount_in_progress()/mount_completed() are used by zones and the VFS
 436  437   * layer (respectively) to synchronize zone state transitions and new
  437  438   * mounts within a zone. This synchronization is on a per-zone basis, so
 438  439   * activity for one zone will not interfere with activity for another zone.
 439  440   *
 440  441   * The semantics are like a reader-reader lock such that there may
 441  442   * either be multiple mounts (or zone state transitions, if that weren't
 442  443   * serialized by zonehash_lock) in progress at the same time, but not
 443  444   * both.
 444  445   *
 445  446   * We use cv's so the user can ctrl-C out of the operation if it's
 446  447   * taking too long.
 447  448   *
 448  449   * The semantics are such that there is unfair bias towards the
 449  450   * "current" operation.  This means that zone halt may starve if
 450  451   * there is a rapid succession of new mounts coming in to the zone.
 451  452   */
  452  453  /*
  453  454   * Prevent new mounts from progressing to the point of calling
  454  455   * VFS_MOUNT().  If there are already mounts in this "region", wait for
  455  456   * them to complete.
             *
             * Returns 1 once mounts are blocked (zone_mounts_in_progress has been
             * decremented; resume_mounts() undoes this), or 0 if the wait was
             * interrupted by a signal, in which case the counter is left untouched.
  456  457   */
  457  458  static int
  458  459  block_mounts(zone_t *zp)
  459  460  {
  460  461          int retval = 0;         /* stays 0 on the "signaled" path */
  461  462  
  462  463          /*
  463  464           * Since it may block for a long time, block_mounts() shouldn't be
  464  465           * called with zonehash_lock held.
  465  466           */
  466  467          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
  467  468          mutex_enter(&zp->zone_mount_lock);
                    /* Wait for mounts already in progress (count > 0) to drain. */
  468  469          while (zp->zone_mounts_in_progress > 0) {
  469  470                  if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
  470  471                          goto signaled;
  471  472          }
  472  473          /*
  473  474           * A negative value of mounts_in_progress indicates that mounts
  474  475           * have been blocked by (-mounts_in_progress) different callers
  475  476           * (remotely possible if two threads enter zone_shutdown at the same
  476  477           * time).
  477  478           */
  478  479          zp->zone_mounts_in_progress--;
  479  480          retval = 1;
  480  481  signaled:
                    /* Both the success and the interrupted path drop the lock here. */
  481  482          mutex_exit(&zp->zone_mount_lock);
  482  483          return (retval);
  483  484  }
 484  485  
  485  486  /*
  486  487   * The VFS layer may progress with new mounts as far as we're concerned.
  487  488   * Allow them to progress if we were the last obstacle.
             *
             * This undoes one successful block_mounts(): the counter is negative
             * while mounts are blocked, so incrementing it back to zero means no
             * blocker remains and threads waiting in mount_in_progress() are woken.
  488  489   */
  489  490  static void
  490  491  resume_mounts(zone_t *zp)
  491  492  {
  492  493          mutex_enter(&zp->zone_mount_lock);
                    /* Only the increment that reaches zero needs to wake waiters. */
  493  494          if (++zp->zone_mounts_in_progress == 0)
  494  495                  cv_broadcast(&zp->zone_mount_cv);
  495  496          mutex_exit(&zp->zone_mount_lock);
  496  497  }
 497  498  
  498  499  /*
  499  500   * The VFS layer is busy with a mount; this zone should wait until all
  500  501   * of its mounts are completed to progress.
             *
             * Waits while mounts are blocked (counter < 0), then counts this mount
             * by incrementing zone_mounts_in_progress.  The wait uses cv_wait()
             * rather than cv_wait_sig(), so there is no signal-interrupted path and
             * nothing is returned to the caller.  Paired with mount_completed().
  501  502   */
  502  503  void
  503  504  mount_in_progress(zone_t *zp)
  504  505  {
  505  506          mutex_enter(&zp->zone_mount_lock);
                    /* A negative count means block_mounts() callers hold mounts off. */
  506  507          while (zp->zone_mounts_in_progress < 0)
  507  508                  cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
  508  509          zp->zone_mounts_in_progress++;
  509  510          mutex_exit(&zp->zone_mount_lock);
  510  511  }
 511  512  
  512  513  /*
  513  514   * VFS is done with one mount; wake up any waiting block_mounts()
  514  515   * callers if this is the last mount.
             *
             * Decrements the count taken by mount_in_progress().  block_mounts()
             * sleeps on zone_mount_cv while the count is positive, so the broadcast
             * is issued only when the count drops to zero.
  515  516   */
  516  517  void
  517  518  mount_completed(zone_t *zp)
  518  519  {
  519  520          mutex_enter(&zp->zone_mount_lock);
                    /* Reaching zero means no mounts remain in progress. */
  520  521          if (--zp->zone_mounts_in_progress == 0)
  521  522                  cv_broadcast(&zp->zone_mount_cv);
  522  523          mutex_exit(&zp->zone_mount_lock);
  523  524  }
 524  525  
 525  526  /*
 526  527   * ZSD routines.
 527  528   *
 528  529   * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 529  530   * defined by the pthread_key_create() and related interfaces.
 530  531   *
 531  532   * Kernel subsystems may register one or more data items and/or
 532  533   * callbacks to be executed when a zone is created, shutdown, or
 533  534   * destroyed.
 534  535   *
 535  536   * Unlike the thread counterpart, destructor callbacks will be executed
 536  537   * even if the data pointer is NULL and/or there are no constructor
 537  538   * callbacks, so it is the responsibility of such callbacks to check for
 538  539   * NULL data values if necessary.
 539  540   *
 540  541   * The locking strategy and overall picture is as follows:
 541  542   *
 542  543   * When someone calls zone_key_create(), a template ZSD entry is added to the
 543  544   * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 544  545   * holding that lock all the existing zones are marked as
 545  546   * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 546  547   * zone_zsd list (protected by zone_lock). The global list is updated first
  547  548   * (under zsd_key_lock) to make sure that newly created zones use the
 548  549   * most recent list of keys. Then under zonehash_lock we walk the zones
 549  550   * and mark them.  Similar locking is used in zone_key_delete().
 550  551   *
 551  552   * The actual create, shutdown, and destroy callbacks are done without
 552  553   * holding any lock. And zsd_flags are used to ensure that the operations
 553  554   * completed so that when zone_key_create (and zone_create) is done, as well as
 554  555   * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 555  556   * are completed.
 556  557   *
 557  558   * When new zones are created constructor callbacks for all registered ZSD
 558  559   * entries will be called. That also uses the above two phases of marking
 559  560   * what needs to be done, and then running the callbacks without holding
 560  561   * any locks.
 561  562   *
 562  563   * The framework does not provide any locking around zone_getspecific() and
 563  564   * zone_setspecific() apart from that needed for internal consistency, so
 564  565   * callers interested in atomic "test-and-set" semantics will need to provide
 565  566   * their own locking.
 566  567   */
 567  568  
  568  569  /*
  569  570   * Helper function to find the zsd_entry associated with the key in the
  570  571   * given list.
             *
             * Walks the list from the head and returns the first entry whose
             * zsd_key equals key, or NULL if there is none.  The list itself is
             * not modified.
             * NOTE(review): presumably the caller holds whatever lock protects the
             * list being searched -- confirm against the callers.
  571  572   */
  572  573  static struct zsd_entry *
  573  574  zsd_find(list_t *l, zone_key_t key)
  574  575  {
  575  576          struct zsd_entry *zsd;
  576  577  
  577  578          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
  578  579                  if (zsd->zsd_key == key) {
  579  580                          return (zsd);
  580  581                  }
  581  582          }
                    /* Reached the end of the list without a match. */
  582  583          return (NULL);
  583  584  }
 584  585  
  585  586  /*
  586  587   * Helper function to find the zsd_entry associated with the key in the
  587  588   * given list. Move it to the front of the list.
             *
             * Returns the matching entry, or NULL if the key is not in the list.
             * Unlike zsd_find(), a successful lookup may relink the entry at the
             * head, so the list can be modified.  The caller visible in this file,
             * zone_key_create(), calls this on zone_zsd with zone_lock held.
  588  589   */
  589  590  static struct zsd_entry *
  590  591  zsd_find_mru(list_t *l, zone_key_t key)
  591  592  {
  592  593          struct zsd_entry *zsd;
  593  594  
  594  595          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
  595  596                  if (zsd->zsd_key == key) {
  596  597                          /*
  597  598                           * Move to head of list to keep list in MRU order.
                                     * Nothing to do if the entry is already the head.
  598  599                           */
  599  600                          if (zsd != list_head(l)) {
  600  601                                  list_remove(l, zsd);
  601  602                                  list_insert_head(l, zsd);
  602  603                          }
  603  604                          return (zsd);
  604  605                  }
  605  606          }
                    /* No entry with this key; the list is left unchanged. */
  606  607          return (NULL);
  607  608  }
 608  609  
 609  610  void
 610  611  zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 611  612      void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 612  613  {
 613  614          struct zsd_entry *zsdp;
 614  615          struct zsd_entry *t;
 615  616          struct zone *zone;
 616  617          zone_key_t  key;
 617  618  
 618  619          zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 619  620          zsdp->zsd_data = NULL;
 620  621          zsdp->zsd_create = create;
 621  622          zsdp->zsd_shutdown = shutdown;
 622  623          zsdp->zsd_destroy = destroy;
 623  624  
 624  625          /*
 625  626           * Insert in global list of callbacks. Makes future zone creations
 626  627           * see it.
 627  628           */
 628  629          mutex_enter(&zsd_key_lock);
 629  630          key = zsdp->zsd_key = ++zsd_keyval;
 630  631          ASSERT(zsd_keyval != 0);
 631  632          list_insert_tail(&zsd_registered_keys, zsdp);
 632  633          mutex_exit(&zsd_key_lock);
 633  634  
 634  635          /*
 635  636           * Insert for all existing zones and mark them as needing
 636  637           * a create callback.
 637  638           */
 638  639          mutex_enter(&zonehash_lock);    /* stop the world */
 639  640          for (zone = list_head(&zone_active); zone != NULL;
 640  641              zone = list_next(&zone_active, zone)) {
 641  642                  zone_status_t status;
 642  643  
 643  644                  mutex_enter(&zone->zone_lock);
 644  645  
 645  646                  /* Skip zones that are on the way down or not yet up */
 646  647                  status = zone_status_get(zone);
 647  648                  if (status >= ZONE_IS_DOWN ||
 648  649                      status == ZONE_IS_UNINITIALIZED) {
 649  650                          mutex_exit(&zone->zone_lock);
 650  651                          continue;
 651  652                  }
 652  653  
 653  654                  t = zsd_find_mru(&zone->zone_zsd, key);
 654  655                  if (t != NULL) {
 655  656                          /*
 656  657                           * A zsd_configure already inserted it after
 657  658                           * we dropped zsd_key_lock above.
 658  659                           */
 659  660                          mutex_exit(&zone->zone_lock);
 660  661                          continue;
 661  662                  }
 662  663                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 663  664                  t->zsd_key = key;
 664  665                  t->zsd_create = create;
 665  666                  t->zsd_shutdown = shutdown;
 666  667                  t->zsd_destroy = destroy;
 667  668                  if (create != NULL) {
 668  669                          t->zsd_flags = ZSD_CREATE_NEEDED;
 669  670                          DTRACE_PROBE2(zsd__create__needed,
 670  671                              zone_t *, zone, zone_key_t, key);
 671  672                  }
 672  673                  list_insert_tail(&zone->zone_zsd, t);
 673  674                  mutex_exit(&zone->zone_lock);
 674  675          }
 675  676          mutex_exit(&zonehash_lock);
 676  677  
 677  678          if (create != NULL) {
 678  679                  /* Now call the create callback for this key */
 679  680                  zsd_apply_all_zones(zsd_apply_create, key);
 680  681          }
 681  682          /*
 682  683           * It is safe for consumers to use the key now, make it
 683  684           * globally visible. Specifically zone_getspecific() will
 684  685           * always successfully return the zone specific data associated
 685  686           * with the key.
 686  687           */
 687  688          *keyp = key;
 688  689  
 689  690  }
 690  691  
 691  692  /*
 692  693   * Function called when a module is being unloaded, or otherwise wishes
 693  694   * to unregister its ZSD key and callbacks.
 694  695   *
 695  696   * Remove from the global list and determine the functions that need to
 696  697   * be called under a global lock. Then call the functions without
 697  698   * holding any locks. Finally free up the zone_zsd entries. (The apply
 698  699   * functions need to access the zone_zsd entries to find zsd_data etc.)
 699  700   */
 700  701  int
 701  702  zone_key_delete(zone_key_t key)
 702  703  {
 703  704          struct zsd_entry *zsdp = NULL;
 704  705          zone_t *zone;
 705  706  
 706  707          mutex_enter(&zsd_key_lock);
 707  708          zsdp = zsd_find_mru(&zsd_registered_keys, key);
 708  709          if (zsdp == NULL) {
 709  710                  mutex_exit(&zsd_key_lock);
 710  711                  return (-1);
 711  712          }
 712  713          list_remove(&zsd_registered_keys, zsdp);
 713  714          mutex_exit(&zsd_key_lock);
 714  715  
 715  716          mutex_enter(&zonehash_lock);
 716  717          for (zone = list_head(&zone_active); zone != NULL;
 717  718              zone = list_next(&zone_active, zone)) {
 718  719                  struct zsd_entry *del;
 719  720  
 720  721                  mutex_enter(&zone->zone_lock);
 721  722                  del = zsd_find_mru(&zone->zone_zsd, key);
 722  723                  if (del == NULL) {
 723  724                          /*
 724  725                           * Somebody else got here first e.g the zone going
 725  726                           * away.
 726  727                           */
 727  728                          mutex_exit(&zone->zone_lock);
 728  729                          continue;
 729  730                  }
 730  731                  ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 731  732                  ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 732  733                  if (del->zsd_shutdown != NULL &&
 733  734                      (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 734  735                          del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 735  736                          DTRACE_PROBE2(zsd__shutdown__needed,
 736  737                              zone_t *, zone, zone_key_t, key);
 737  738                  }
 738  739                  if (del->zsd_destroy != NULL &&
 739  740                      (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 740  741                          del->zsd_flags |= ZSD_DESTROY_NEEDED;
 741  742                          DTRACE_PROBE2(zsd__destroy__needed,
 742  743                              zone_t *, zone, zone_key_t, key);
 743  744                  }
 744  745                  mutex_exit(&zone->zone_lock);
 745  746          }
 746  747          mutex_exit(&zonehash_lock);
 747  748          kmem_free(zsdp, sizeof (*zsdp));
 748  749  
 749  750          /* Now call the shutdown and destroy callback for this key */
 750  751          zsd_apply_all_zones(zsd_apply_shutdown, key);
 751  752          zsd_apply_all_zones(zsd_apply_destroy, key);
 752  753  
 753  754          /* Now we can free up the zsdp structures in each zone */
 754  755          mutex_enter(&zonehash_lock);
 755  756          for (zone = list_head(&zone_active); zone != NULL;
 756  757              zone = list_next(&zone_active, zone)) {
 757  758                  struct zsd_entry *del;
 758  759  
 759  760                  mutex_enter(&zone->zone_lock);
 760  761                  del = zsd_find(&zone->zone_zsd, key);
 761  762                  if (del != NULL) {
 762  763                          list_remove(&zone->zone_zsd, del);
 763  764                          ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 764  765                          kmem_free(del, sizeof (*del));
 765  766                  }
 766  767                  mutex_exit(&zone->zone_lock);
 767  768          }
 768  769          mutex_exit(&zonehash_lock);
 769  770  
 770  771          return (0);
 771  772  }
 772  773  
 773  774  /*
 774  775   * ZSD counterpart of pthread_setspecific().
 775  776   *
 776  777   * Since all zsd callbacks, including those with no create function,
 777  778   * have an entry in zone_zsd, if the key is registered it is part of
 778  779   * the zone_zsd list.
 779  780   * Return an error if the key wasn't registerd.
 780  781   */
 781  782  int
 782  783  zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 783  784  {
 784  785          struct zsd_entry *t;
 785  786  
 786  787          mutex_enter(&zone->zone_lock);
 787  788          t = zsd_find_mru(&zone->zone_zsd, key);
 788  789          if (t != NULL) {
 789  790                  /*
 790  791                   * Replace old value with new
 791  792                   */
 792  793                  t->zsd_data = (void *)data;
 793  794                  mutex_exit(&zone->zone_lock);
 794  795                  return (0);
 795  796          }
 796  797          mutex_exit(&zone->zone_lock);
 797  798          return (-1);
 798  799  }
 799  800  
 800  801  /*
 801  802   * ZSD counterpart of pthread_getspecific().
 802  803   */
 803  804  void *
 804  805  zone_getspecific(zone_key_t key, zone_t *zone)
 805  806  {
 806  807          struct zsd_entry *t;
 807  808          void *data;
 808  809  
 809  810          mutex_enter(&zone->zone_lock);
 810  811          t = zsd_find_mru(&zone->zone_zsd, key);
 811  812          data = (t == NULL ? NULL : t->zsd_data);
 812  813          mutex_exit(&zone->zone_lock);
 813  814          return (data);
 814  815  }
 815  816  
 816  817  /*
 817  818   * Function used to initialize a zone's list of ZSD callbacks and data
 818  819   * when the zone is being created.  The callbacks are initialized from
 819  820   * the template list (zsd_registered_keys). The constructor callback is
 820  821   * executed later (once the zone exists and with locks dropped).
 821  822   */
 822  823  static void
 823  824  zone_zsd_configure(zone_t *zone)
 824  825  {
 825  826          struct zsd_entry *zsdp;
 826  827          struct zsd_entry *t;
 827  828  
 828  829          ASSERT(MUTEX_HELD(&zonehash_lock));
 829  830          ASSERT(list_head(&zone->zone_zsd) == NULL);
 830  831          mutex_enter(&zone->zone_lock);
 831  832          mutex_enter(&zsd_key_lock);
 832  833          for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 833  834              zsdp = list_next(&zsd_registered_keys, zsdp)) {
 834  835                  /*
 835  836                   * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 836  837                   * should not have added anything to it.
 837  838                   */
 838  839                  ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 839  840  
 840  841                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 841  842                  t->zsd_key = zsdp->zsd_key;
 842  843                  t->zsd_create = zsdp->zsd_create;
 843  844                  t->zsd_shutdown = zsdp->zsd_shutdown;
 844  845                  t->zsd_destroy = zsdp->zsd_destroy;
 845  846                  if (zsdp->zsd_create != NULL) {
 846  847                          t->zsd_flags = ZSD_CREATE_NEEDED;
 847  848                          DTRACE_PROBE2(zsd__create__needed,
 848  849                              zone_t *, zone, zone_key_t, zsdp->zsd_key);
 849  850                  }
 850  851                  list_insert_tail(&zone->zone_zsd, t);
 851  852          }
 852  853          mutex_exit(&zsd_key_lock);
 853  854          mutex_exit(&zone->zone_lock);
 854  855  }
 855  856  
 856  857  enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 857  858  
 858  859  /*
 859  860   * Helper function to execute shutdown or destructor callbacks.
 860  861   */
 861  862  static void
 862  863  zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 863  864  {
 864  865          struct zsd_entry *t;
 865  866  
 866  867          ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 867  868          ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 868  869          ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 869  870  
 870  871          /*
 871  872           * Run the callback solely based on what is registered for the zone
 872  873           * in zone_zsd. The global list can change independently of this
 873  874           * as keys are registered and unregistered and we don't register new
 874  875           * callbacks for a zone that is in the process of going away.
 875  876           */
 876  877          mutex_enter(&zone->zone_lock);
 877  878          for (t = list_head(&zone->zone_zsd); t != NULL;
 878  879              t = list_next(&zone->zone_zsd, t)) {
 879  880                  zone_key_t key = t->zsd_key;
 880  881  
 881  882                  /* Skip if no callbacks registered */
 882  883  
 883  884                  if (ct == ZSD_SHUTDOWN) {
 884  885                          if (t->zsd_shutdown != NULL &&
 885  886                              (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 886  887                                  t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 887  888                                  DTRACE_PROBE2(zsd__shutdown__needed,
 888  889                                      zone_t *, zone, zone_key_t, key);
 889  890                          }
 890  891                  } else {
 891  892                          if (t->zsd_destroy != NULL &&
 892  893                              (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 893  894                                  t->zsd_flags |= ZSD_DESTROY_NEEDED;
 894  895                                  DTRACE_PROBE2(zsd__destroy__needed,
 895  896                                      zone_t *, zone, zone_key_t, key);
 896  897                          }
 897  898                  }
 898  899          }
 899  900          mutex_exit(&zone->zone_lock);
 900  901  
 901  902          /* Now call the shutdown and destroy callback for this key */
 902  903          zsd_apply_all_keys(zsd_apply_shutdown, zone);
 903  904          zsd_apply_all_keys(zsd_apply_destroy, zone);
 904  905  
 905  906  }
 906  907  
 907  908  /*
 908  909   * Called when the zone is going away; free ZSD-related memory, and
 909  910   * destroy the zone_zsd list.
 910  911   */
 911  912  static void
 912  913  zone_free_zsd(zone_t *zone)
 913  914  {
 914  915          struct zsd_entry *t, *next;
 915  916  
 916  917          /*
 917  918           * Free all the zsd_entry's we had on this zone.
 918  919           */
 919  920          mutex_enter(&zone->zone_lock);
 920  921          for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 921  922                  next = list_next(&zone->zone_zsd, t);
 922  923                  list_remove(&zone->zone_zsd, t);
 923  924                  ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 924  925                  kmem_free(t, sizeof (*t));
 925  926          }
 926  927          list_destroy(&zone->zone_zsd);
 927  928          mutex_exit(&zone->zone_lock);
 928  929  
 929  930  }
 930  931  
 931  932  /*
 932  933   * Apply a function to all zones for particular key value.
 933  934   *
 934  935   * The applyfn has to drop zonehash_lock if it does some work, and
 935  936   * then reacquire it before it returns.
 936  937   * When the lock is dropped we don't follow list_next even
 937  938   * if it is possible to do so without any hazards. This is
 938  939   * because we want the design to allow for the list of zones
 939  940   * to change in any arbitrary way during the time the
 940  941   * lock was dropped.
 941  942   *
 942  943   * It is safe to restart the loop at list_head since the applyfn
 943  944   * changes the zsd_flags as it does work, so a subsequent
 944  945   * pass through will have no effect in applyfn, hence the loop will terminate
 945  946   * in at worst O(N^2).
 946  947   */
 947  948  static void
 948  949  zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 949  950  {
 950  951          zone_t *zone;
 951  952  
 952  953          mutex_enter(&zonehash_lock);
 953  954          zone = list_head(&zone_active);
 954  955          while (zone != NULL) {
 955  956                  if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 956  957                          /* Lock dropped - restart at head */
 957  958                          zone = list_head(&zone_active);
 958  959                  } else {
 959  960                          zone = list_next(&zone_active, zone);
 960  961                  }
 961  962          }
 962  963          mutex_exit(&zonehash_lock);
 963  964  }
 964  965  
 965  966  /*
 966  967   * Apply a function to all keys for a particular zone.
 967  968   *
 968  969   * The applyfn has to drop zonehash_lock if it does some work, and
 969  970   * then reacquire it before it returns.
 970  971   * When the lock is dropped we don't follow list_next even
 971  972   * if it is possible to do so without any hazards. This is
 972  973   * because we want the design to allow for the list of zsd callbacks
 973  974   * to change in any arbitrary way during the time the
 974  975   * lock was dropped.
 975  976   *
 976  977   * It is safe to restart the loop at list_head since the applyfn
 977  978   * changes the zsd_flags as it does work, so a subsequent
 978  979   * pass through will have no effect in applyfn, hence the loop will terminate
 979  980   * in at worst O(N^2).
 980  981   */
 981  982  static void
 982  983  zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 983  984  {
 984  985          struct zsd_entry *t;
 985  986  
 986  987          mutex_enter(&zone->zone_lock);
 987  988          t = list_head(&zone->zone_zsd);
 988  989          while (t != NULL) {
 989  990                  if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 990  991                          /* Lock dropped - restart at head */
 991  992                          t = list_head(&zone->zone_zsd);
 992  993                  } else {
 993  994                          t = list_next(&zone->zone_zsd, t);
 994  995                  }
 995  996          }
 996  997          mutex_exit(&zone->zone_lock);
 997  998  }
 998  999  
 999 1000  /*
1000 1001   * Call the create function for the zone and key if CREATE_NEEDED
1001 1002   * is set.
1002 1003   * If some other thread gets here first and sets CREATE_INPROGRESS, then
1003 1004   * we wait for that thread to complete so that we can ensure that
1004 1005   * all the callbacks are done when we've looped over all zones/keys.
1005 1006   *
1006 1007   * When we call the create function, we drop the global held by the
1007 1008   * caller, and return true to tell the caller it needs to re-evalute the
1008 1009   * state.
1009 1010   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1010 1011   * remains held on exit.
1011 1012   */
1012 1013  static boolean_t
1013 1014  zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1014 1015      zone_t *zone, zone_key_t key)
1015 1016  {
1016 1017          void *result;
1017 1018          struct zsd_entry *t;
1018 1019          boolean_t dropped;
1019 1020  
1020 1021          if (lockp != NULL) {
1021 1022                  ASSERT(MUTEX_HELD(lockp));
1022 1023          }
1023 1024          if (zone_lock_held) {
1024 1025                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1025 1026          } else {
1026 1027                  mutex_enter(&zone->zone_lock);
1027 1028          }
1028 1029  
1029 1030          t = zsd_find(&zone->zone_zsd, key);
1030 1031          if (t == NULL) {
1031 1032                  /*
1032 1033                   * Somebody else got here first e.g the zone going
1033 1034                   * away.
1034 1035                   */
1035 1036                  if (!zone_lock_held)
1036 1037                          mutex_exit(&zone->zone_lock);
1037 1038                  return (B_FALSE);
1038 1039          }
1039 1040          dropped = B_FALSE;
1040 1041          if (zsd_wait_for_inprogress(zone, t, lockp))
1041 1042                  dropped = B_TRUE;
1042 1043  
1043 1044          if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1044 1045                  t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1045 1046                  t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1046 1047                  DTRACE_PROBE2(zsd__create__inprogress,
1047 1048                      zone_t *, zone, zone_key_t, key);
1048 1049                  mutex_exit(&zone->zone_lock);
1049 1050                  if (lockp != NULL)
1050 1051                          mutex_exit(lockp);
1051 1052  
1052 1053                  dropped = B_TRUE;
1053 1054                  ASSERT(t->zsd_create != NULL);
1054 1055                  DTRACE_PROBE2(zsd__create__start,
1055 1056                      zone_t *, zone, zone_key_t, key);
1056 1057  
1057 1058                  result = (*t->zsd_create)(zone->zone_id);
1058 1059  
1059 1060                  DTRACE_PROBE2(zsd__create__end,
1060 1061                      zone_t *, zone, voidn *, result);
1061 1062  
1062 1063                  ASSERT(result != NULL);
1063 1064                  if (lockp != NULL)
1064 1065                          mutex_enter(lockp);
1065 1066                  mutex_enter(&zone->zone_lock);
1066 1067                  t->zsd_data = result;
1067 1068                  t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1068 1069                  t->zsd_flags |= ZSD_CREATE_COMPLETED;
1069 1070                  cv_broadcast(&t->zsd_cv);
1070 1071                  DTRACE_PROBE2(zsd__create__completed,
1071 1072                      zone_t *, zone, zone_key_t, key);
1072 1073          }
1073 1074          if (!zone_lock_held)
1074 1075                  mutex_exit(&zone->zone_lock);
1075 1076          return (dropped);
1076 1077  }
1077 1078  
1078 1079  /*
1079 1080   * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1080 1081   * is set.
1081 1082   * If some other thread gets here first and sets *_INPROGRESS, then
1082 1083   * we wait for that thread to complete so that we can ensure that
1083 1084   * all the callbacks are done when we've looped over all zones/keys.
1084 1085   *
1085 1086   * When we call the shutdown function, we drop the global held by the
1086 1087   * caller, and return true to tell the caller it needs to re-evalute the
1087 1088   * state.
1088 1089   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1089 1090   * remains held on exit.
1090 1091   */
1091 1092  static boolean_t
1092 1093  zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1093 1094      zone_t *zone, zone_key_t key)
1094 1095  {
1095 1096          struct zsd_entry *t;
1096 1097          void *data;
1097 1098          boolean_t dropped;
1098 1099  
1099 1100          if (lockp != NULL) {
1100 1101                  ASSERT(MUTEX_HELD(lockp));
1101 1102          }
1102 1103          if (zone_lock_held) {
1103 1104                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1104 1105          } else {
1105 1106                  mutex_enter(&zone->zone_lock);
1106 1107          }
1107 1108  
1108 1109          t = zsd_find(&zone->zone_zsd, key);
1109 1110          if (t == NULL) {
1110 1111                  /*
1111 1112                   * Somebody else got here first e.g the zone going
1112 1113                   * away.
1113 1114                   */
1114 1115                  if (!zone_lock_held)
1115 1116                          mutex_exit(&zone->zone_lock);
1116 1117                  return (B_FALSE);
1117 1118          }
1118 1119          dropped = B_FALSE;
1119 1120          if (zsd_wait_for_creator(zone, t, lockp))
1120 1121                  dropped = B_TRUE;
1121 1122  
1122 1123          if (zsd_wait_for_inprogress(zone, t, lockp))
1123 1124                  dropped = B_TRUE;
1124 1125  
1125 1126          if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1126 1127                  t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1127 1128                  t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1128 1129                  DTRACE_PROBE2(zsd__shutdown__inprogress,
1129 1130                      zone_t *, zone, zone_key_t, key);
1130 1131                  mutex_exit(&zone->zone_lock);
1131 1132                  if (lockp != NULL)
1132 1133                          mutex_exit(lockp);
1133 1134                  dropped = B_TRUE;
1134 1135  
1135 1136                  ASSERT(t->zsd_shutdown != NULL);
1136 1137                  data = t->zsd_data;
1137 1138  
1138 1139                  DTRACE_PROBE2(zsd__shutdown__start,
1139 1140                      zone_t *, zone, zone_key_t, key);
1140 1141  
1141 1142                  (t->zsd_shutdown)(zone->zone_id, data);
1142 1143                  DTRACE_PROBE2(zsd__shutdown__end,
1143 1144                      zone_t *, zone, zone_key_t, key);
1144 1145  
1145 1146                  if (lockp != NULL)
1146 1147                          mutex_enter(lockp);
1147 1148                  mutex_enter(&zone->zone_lock);
1148 1149                  t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1149 1150                  t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1150 1151                  cv_broadcast(&t->zsd_cv);
1151 1152                  DTRACE_PROBE2(zsd__shutdown__completed,
1152 1153                      zone_t *, zone, zone_key_t, key);
1153 1154          }
1154 1155          if (!zone_lock_held)
1155 1156                  mutex_exit(&zone->zone_lock);
1156 1157          return (dropped);
1157 1158  }
1158 1159  
1159 1160  /*
1160 1161   * Call the destroy function for the zone and key if DESTROY_NEEDED
1161 1162   * is set.
1162 1163   * If some other thread gets here first and sets *_INPROGRESS, then
1163 1164   * we wait for that thread to complete so that we can ensure that
1164 1165   * all the callbacks are done when we've looped over all zones/keys.
1165 1166   *
1166 1167   * When we call the destroy function, we drop the global held by the
1167 1168   * caller, and return true to tell the caller it needs to re-evalute the
1168 1169   * state.
1169 1170   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1170 1171   * remains held on exit.
1171 1172   */
1172 1173  static boolean_t
1173 1174  zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1174 1175      zone_t *zone, zone_key_t key)
1175 1176  {
1176 1177          struct zsd_entry *t;
1177 1178          void *data;
1178 1179          boolean_t dropped;
1179 1180  
1180 1181          if (lockp != NULL) {
1181 1182                  ASSERT(MUTEX_HELD(lockp));
1182 1183          }
1183 1184          if (zone_lock_held) {
1184 1185                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1185 1186          } else {
1186 1187                  mutex_enter(&zone->zone_lock);
1187 1188          }
1188 1189  
1189 1190          t = zsd_find(&zone->zone_zsd, key);
1190 1191          if (t == NULL) {
1191 1192                  /*
1192 1193                   * Somebody else got here first e.g the zone going
1193 1194                   * away.
1194 1195                   */
1195 1196                  if (!zone_lock_held)
1196 1197                          mutex_exit(&zone->zone_lock);
1197 1198                  return (B_FALSE);
1198 1199          }
1199 1200          dropped = B_FALSE;
1200 1201          if (zsd_wait_for_creator(zone, t, lockp))
1201 1202                  dropped = B_TRUE;
1202 1203  
1203 1204          if (zsd_wait_for_inprogress(zone, t, lockp))
1204 1205                  dropped = B_TRUE;
1205 1206  
1206 1207          if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1207 1208                  t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1208 1209                  t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1209 1210                  DTRACE_PROBE2(zsd__destroy__inprogress,
1210 1211                      zone_t *, zone, zone_key_t, key);
1211 1212                  mutex_exit(&zone->zone_lock);
1212 1213                  if (lockp != NULL)
1213 1214                          mutex_exit(lockp);
1214 1215                  dropped = B_TRUE;
1215 1216  
1216 1217                  ASSERT(t->zsd_destroy != NULL);
1217 1218                  data = t->zsd_data;
1218 1219                  DTRACE_PROBE2(zsd__destroy__start,
1219 1220                      zone_t *, zone, zone_key_t, key);
1220 1221  
1221 1222                  (t->zsd_destroy)(zone->zone_id, data);
1222 1223                  DTRACE_PROBE2(zsd__destroy__end,
1223 1224                      zone_t *, zone, zone_key_t, key);
1224 1225  
1225 1226                  if (lockp != NULL)
1226 1227                          mutex_enter(lockp);
1227 1228                  mutex_enter(&zone->zone_lock);
1228 1229                  t->zsd_data = NULL;
1229 1230                  t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1230 1231                  t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1231 1232                  cv_broadcast(&t->zsd_cv);
1232 1233                  DTRACE_PROBE2(zsd__destroy__completed,
1233 1234                      zone_t *, zone, zone_key_t, key);
1234 1235          }
1235 1236          if (!zone_lock_held)
1236 1237                  mutex_exit(&zone->zone_lock);
1237 1238          return (dropped);
1238 1239  }
1239 1240  
1240 1241  /*
1241 1242   * Wait for any CREATE_NEEDED flag to be cleared.
1242 1243   * Returns true if lockp was temporarily dropped while waiting.
1243 1244   */
1244 1245  static boolean_t
1245 1246  zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1246 1247  {
1247 1248          boolean_t dropped = B_FALSE;
1248 1249  
1249 1250          while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1250 1251                  DTRACE_PROBE2(zsd__wait__for__creator,
1251 1252                      zone_t *, zone, struct zsd_entry *, t);
1252 1253                  if (lockp != NULL) {
1253 1254                          dropped = B_TRUE;
1254 1255                          mutex_exit(lockp);
1255 1256                  }
1256 1257                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1257 1258                  if (lockp != NULL) {
1258 1259                          /* First drop zone_lock to preserve order */
1259 1260                          mutex_exit(&zone->zone_lock);
1260 1261                          mutex_enter(lockp);
1261 1262                          mutex_enter(&zone->zone_lock);
1262 1263                  }
1263 1264          }
1264 1265          return (dropped);
1265 1266  }
1266 1267  
1267 1268  /*
1268 1269   * Wait for any INPROGRESS flag to be cleared.
1269 1270   * Returns true if lockp was temporarily dropped while waiting.
1270 1271   */
1271 1272  static boolean_t
1272 1273  zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1273 1274  {
1274 1275          boolean_t dropped = B_FALSE;
1275 1276  
1276 1277          while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1277 1278                  DTRACE_PROBE2(zsd__wait__for__inprogress,
1278 1279                      zone_t *, zone, struct zsd_entry *, t);
1279 1280                  if (lockp != NULL) {
1280 1281                          dropped = B_TRUE;
1281 1282                          mutex_exit(lockp);
1282 1283                  }
1283 1284                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1284 1285                  if (lockp != NULL) {
1285 1286                          /* First drop zone_lock to preserve order */
1286 1287                          mutex_exit(&zone->zone_lock);
1287 1288                          mutex_enter(lockp);
1288 1289                          mutex_enter(&zone->zone_lock);
1289 1290                  }
1290 1291          }
1291 1292          return (dropped);
1292 1293  }
1293 1294  
1294 1295  /*
1295 1296   * Frees memory associated with the zone dataset list.
1296 1297   */
1297 1298  static void
1298 1299  zone_free_datasets(zone_t *zone)
1299 1300  {
1300 1301          zone_dataset_t *t, *next;
1301 1302  
1302 1303          for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1303 1304                  next = list_next(&zone->zone_datasets, t);
1304 1305                  list_remove(&zone->zone_datasets, t);
1305 1306                  kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1306 1307                  kmem_free(t, sizeof (*t));
1307 1308          }
1308 1309          list_destroy(&zone->zone_datasets);
1309 1310  }
1310 1311  
1311 1312  /*
1312 1313   * zone.cpu-shares resource control support.
1313 1314   */
1314 1315  /*ARGSUSED*/
1315 1316  static rctl_qty_t
1316 1317  zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1317 1318  {
1318 1319          ASSERT(MUTEX_HELD(&p->p_lock));
1319 1320          return (p->p_zone->zone_shares);
1320 1321  }
1321 1322  
1322 1323  /*ARGSUSED*/
1323 1324  static int
1324 1325  zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1325 1326      rctl_qty_t nv)
1326 1327  {
1327 1328          ASSERT(MUTEX_HELD(&p->p_lock));
1328 1329          ASSERT(e->rcep_t == RCENTITY_ZONE);
1329 1330          if (e->rcep_p.zone == NULL)
1330 1331                  return (0);
1331 1332  
1332 1333          e->rcep_p.zone->zone_shares = nv;
1333 1334          return (0);
1334 1335  }
1335 1336  
/*
 * Operations vector for the zone.cpu-shares resource control.  Judging by
 * the handler names the slots are, in order: action, usage, set and test
 * (NOTE(review): confirm against the rctl_ops_t definition).  The rcop_no_*
 * entries are the stock handlers supplied for slots with no zone-specific
 * function.
 */
static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,
        zone_cpu_shares_usage,
        zone_cpu_shares_set,
        rcop_no_test
};
1342 1343  
1343 1344  /*
1344 1345   * zone.cpu-cap resource control support.
1345 1346   */
1346 1347  /*ARGSUSED*/
1347 1348  static rctl_qty_t
1348 1349  zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1349 1350  {
1350 1351          ASSERT(MUTEX_HELD(&p->p_lock));
1351 1352          return (cpucaps_zone_get(p->p_zone));
1352 1353  }
1353 1354  
1354 1355  /*ARGSUSED*/
1355 1356  static int
1356 1357  zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1357 1358      rctl_qty_t nv)
1358 1359  {
1359 1360          zone_t *zone = e->rcep_p.zone;
1360 1361  
1361 1362          ASSERT(MUTEX_HELD(&p->p_lock));
1362 1363          ASSERT(e->rcep_t == RCENTITY_ZONE);
1363 1364  
1364 1365          if (zone == NULL)
1365 1366                  return (0);
1366 1367  
1367 1368          /*
1368 1369           * set cap to the new value.
1369 1370           */
1370 1371          return (cpucaps_zone_set(zone, nv));
1371 1372  }
1372 1373  
/*
 * Operations vector for the zone.cpu-cap resource control.  Judging by the
 * handler names the slots are, in order: action, usage, set and test
 * (NOTE(review): confirm against the rctl_ops_t definition).
 */
static rctl_ops_t zone_cpu_cap_ops = {
        rcop_no_action,
        zone_cpu_cap_get,
        zone_cpu_cap_set,
        rcop_no_test
};
1379 1380  
1380 1381  /*ARGSUSED*/
1381 1382  static rctl_qty_t
1382 1383  zone_lwps_usage(rctl_t *r, proc_t *p)
1383 1384  {
1384 1385          rctl_qty_t nlwps;
1385 1386          zone_t *zone = p->p_zone;
1386 1387  
1387 1388          ASSERT(MUTEX_HELD(&p->p_lock));
1388 1389  
1389 1390          mutex_enter(&zone->zone_nlwps_lock);
1390 1391          nlwps = zone->zone_nlwps;
1391 1392          mutex_exit(&zone->zone_nlwps_lock);
1392 1393  
1393 1394          return (nlwps);
1394 1395  }
1395 1396  
1396 1397  /*ARGSUSED*/
1397 1398  static int
1398 1399  zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1399 1400      rctl_qty_t incr, uint_t flags)
1400 1401  {
1401 1402          rctl_qty_t nlwps;
1402 1403  
1403 1404          ASSERT(MUTEX_HELD(&p->p_lock));
1404 1405          ASSERT(e->rcep_t == RCENTITY_ZONE);
1405 1406          if (e->rcep_p.zone == NULL)
1406 1407                  return (0);
1407 1408          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1408 1409          nlwps = e->rcep_p.zone->zone_nlwps;
1409 1410  
1410 1411          if (nlwps + incr > rcntl->rcv_value)
1411 1412                  return (1);
1412 1413  
1413 1414          return (0);
1414 1415  }
1415 1416  
1416 1417  /*ARGSUSED*/
1417 1418  static int
1418 1419  zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1419 1420  {
1420 1421          ASSERT(MUTEX_HELD(&p->p_lock));
1421 1422          ASSERT(e->rcep_t == RCENTITY_ZONE);
1422 1423          if (e->rcep_p.zone == NULL)
1423 1424                  return (0);
1424 1425          e->rcep_p.zone->zone_nlwps_ctl = nv;
1425 1426          return (0);
1426 1427  }
1427 1428  
/*
 * Operations vector tying together the zone_lwps_* handlers.  Judging by the
 * handler names the slots are, in order: action, usage, set and test
 * (NOTE(review): confirm against the rctl_ops_t definition).
 */
static rctl_ops_t zone_lwps_ops = {
        rcop_no_action,
        zone_lwps_usage,
        zone_lwps_set,
        zone_lwps_test,
};
1434 1435  
1435 1436  /*ARGSUSED*/
1436 1437  static rctl_qty_t
1437 1438  zone_procs_usage(rctl_t *r, proc_t *p)
1438 1439  {
1439 1440          rctl_qty_t nprocs;
1440 1441          zone_t *zone = p->p_zone;
1441 1442  
1442 1443          ASSERT(MUTEX_HELD(&p->p_lock));
1443 1444  
1444 1445          mutex_enter(&zone->zone_nlwps_lock);
1445 1446          nprocs = zone->zone_nprocs;
1446 1447          mutex_exit(&zone->zone_nlwps_lock);
1447 1448  
1448 1449          return (nprocs);
1449 1450  }
1450 1451  
1451 1452  /*ARGSUSED*/
1452 1453  static int
1453 1454  zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1454 1455      rctl_qty_t incr, uint_t flags)
1455 1456  {
1456 1457          rctl_qty_t nprocs;
1457 1458  
1458 1459          ASSERT(MUTEX_HELD(&p->p_lock));
1459 1460          ASSERT(e->rcep_t == RCENTITY_ZONE);
1460 1461          if (e->rcep_p.zone == NULL)
1461 1462                  return (0);
1462 1463          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1463 1464          nprocs = e->rcep_p.zone->zone_nprocs;
1464 1465  
1465 1466          if (nprocs + incr > rcntl->rcv_value)
1466 1467                  return (1);
1467 1468  
1468 1469          return (0);
1469 1470  }
1470 1471  
1471 1472  /*ARGSUSED*/
1472 1473  static int
1473 1474  zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1474 1475  {
1475 1476          ASSERT(MUTEX_HELD(&p->p_lock));
1476 1477          ASSERT(e->rcep_t == RCENTITY_ZONE);
1477 1478          if (e->rcep_p.zone == NULL)
1478 1479                  return (0);
1479 1480          e->rcep_p.zone->zone_nprocs_ctl = nv;
1480 1481          return (0);
1481 1482  }
1482 1483  
/*
 * Operations vector tying together the zone_procs_* handlers.  Judging by the
 * handler names the slots are, in order: action, usage, set and test
 * (NOTE(review): confirm against the rctl_ops_t definition).
 */
static rctl_ops_t zone_procs_ops = {
        rcop_no_action,
        zone_procs_usage,
        zone_procs_set,
        zone_procs_test,
};
1489 1490  
1490 1491  /*ARGSUSED*/
1491 1492  static rctl_qty_t
1492 1493  zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1493 1494  {
1494 1495          ASSERT(MUTEX_HELD(&p->p_lock));
1495 1496          return (p->p_zone->zone_shmmax);
1496 1497  }
1497 1498  
1498 1499  /*ARGSUSED*/
1499 1500  static int
1500 1501  zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1501 1502      rctl_qty_t incr, uint_t flags)
1502 1503  {
1503 1504          rctl_qty_t v;
1504 1505          ASSERT(MUTEX_HELD(&p->p_lock));
1505 1506          ASSERT(e->rcep_t == RCENTITY_ZONE);
1506 1507          v = e->rcep_p.zone->zone_shmmax + incr;
1507 1508          if (v > rval->rcv_value)
1508 1509                  return (1);
1509 1510          return (0);
1510 1511  }
1511 1512  
/*
 * Operations vector tying together the zone_shmmax_* handlers; the set slot
 * uses the stock rcop_no_set.  Judging by the handler names the slots are,
 * in order: action, usage, set and test (NOTE(review): confirm against the
 * rctl_ops_t definition).
 */
static rctl_ops_t zone_shmmax_ops = {
        rcop_no_action,
        zone_shmmax_usage,
        rcop_no_set,
        zone_shmmax_test
};
1518 1519  
1519 1520  /*ARGSUSED*/
1520 1521  static rctl_qty_t
1521 1522  zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1522 1523  {
1523 1524          ASSERT(MUTEX_HELD(&p->p_lock));
1524 1525          return (p->p_zone->zone_ipc.ipcq_shmmni);
1525 1526  }
1526 1527  
1527 1528  /*ARGSUSED*/
1528 1529  static int
1529 1530  zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530 1531      rctl_qty_t incr, uint_t flags)
1531 1532  {
1532 1533          rctl_qty_t v;
1533 1534          ASSERT(MUTEX_HELD(&p->p_lock));
1534 1535          ASSERT(e->rcep_t == RCENTITY_ZONE);
1535 1536          v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1536 1537          if (v > rval->rcv_value)
1537 1538                  return (1);
1538 1539          return (0);
1539 1540  }
1540 1541  
/*
 * Operations vector tying together the zone_shmmni_* handlers; the set slot
 * uses the stock rcop_no_set.  Judging by the handler names the slots are,
 * in order: action, usage, set and test (NOTE(review): confirm against the
 * rctl_ops_t definition).
 */
static rctl_ops_t zone_shmmni_ops = {
        rcop_no_action,
        zone_shmmni_usage,
        rcop_no_set,
        zone_shmmni_test
};
1547 1548  
1548 1549  /*ARGSUSED*/
1549 1550  static rctl_qty_t
1550 1551  zone_semmni_usage(rctl_t *rctl, struct proc *p)
1551 1552  {
1552 1553          ASSERT(MUTEX_HELD(&p->p_lock));
1553 1554          return (p->p_zone->zone_ipc.ipcq_semmni);
1554 1555  }
1555 1556  
1556 1557  /*ARGSUSED*/
1557 1558  static int
1558 1559  zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1559 1560      rctl_qty_t incr, uint_t flags)
1560 1561  {
1561 1562          rctl_qty_t v;
1562 1563          ASSERT(MUTEX_HELD(&p->p_lock));
1563 1564          ASSERT(e->rcep_t == RCENTITY_ZONE);
1564 1565          v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1565 1566          if (v > rval->rcv_value)
1566 1567                  return (1);
1567 1568          return (0);
1568 1569  }
1569 1570  
/*
 * Operations vector tying together the zone_semmni_* handlers; the set slot
 * uses the stock rcop_no_set.  Judging by the handler names the slots are,
 * in order: action, usage, set and test (NOTE(review): confirm against the
 * rctl_ops_t definition).
 */
static rctl_ops_t zone_semmni_ops = {
        rcop_no_action,
        zone_semmni_usage,
        rcop_no_set,
        zone_semmni_test
};
1576 1577  
1577 1578  /*ARGSUSED*/
1578 1579  static rctl_qty_t
1579 1580  zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1580 1581  {
1581 1582          ASSERT(MUTEX_HELD(&p->p_lock));
1582 1583          return (p->p_zone->zone_ipc.ipcq_msgmni);
1583 1584  }
1584 1585  
1585 1586  /*ARGSUSED*/
1586 1587  static int
1587 1588  zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1588 1589      rctl_qty_t incr, uint_t flags)
1589 1590  {
1590 1591          rctl_qty_t v;
1591 1592          ASSERT(MUTEX_HELD(&p->p_lock));
1592 1593          ASSERT(e->rcep_t == RCENTITY_ZONE);
1593 1594          v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1594 1595          if (v > rval->rcv_value)
1595 1596                  return (1);
1596 1597          return (0);
1597 1598  }
1598 1599  
/*
 * Operations vector tying together the zone_msgmni_* handlers; the set slot
 * uses the stock rcop_no_set.  Judging by the handler names the slots are,
 * in order: action, usage, set and test (NOTE(review): confirm against the
 * rctl_ops_t definition).
 */
static rctl_ops_t zone_msgmni_ops = {
        rcop_no_action,
        zone_msgmni_usage,
        rcop_no_set,
        zone_msgmni_test
};
1605 1606  
1606 1607  /*ARGSUSED*/
1607 1608  static rctl_qty_t
1608 1609  zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1609 1610  {
1610 1611          rctl_qty_t q;
1611 1612          ASSERT(MUTEX_HELD(&p->p_lock));
1612 1613          mutex_enter(&p->p_zone->zone_mem_lock);
1613 1614          q = p->p_zone->zone_locked_mem;
1614 1615          mutex_exit(&p->p_zone->zone_mem_lock);
1615 1616          return (q);
1616 1617  }
1617 1618  
1618 1619  /*ARGSUSED*/
1619 1620  static int
1620 1621  zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1621 1622      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1622 1623  {
1623 1624          rctl_qty_t q;
1624 1625          zone_t *z;
1625 1626  
1626 1627          z = e->rcep_p.zone;
1627 1628          ASSERT(MUTEX_HELD(&p->p_lock));
1628 1629          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1629 1630          q = z->zone_locked_mem;
1630 1631          if (q + incr > rcntl->rcv_value)
1631 1632                  return (1);
1632 1633          return (0);
1633 1634  }
1634 1635  
1635 1636  /*ARGSUSED*/
1636 1637  static int
1637 1638  zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1638 1639      rctl_qty_t nv)
1639 1640  {
1640 1641          ASSERT(MUTEX_HELD(&p->p_lock));
1641 1642          ASSERT(e->rcep_t == RCENTITY_ZONE);
1642 1643          if (e->rcep_p.zone == NULL)
1643 1644                  return (0);
1644 1645          e->rcep_p.zone->zone_locked_mem_ctl = nv;
1645 1646          return (0);
1646 1647  }
1647 1648  
/*
 * Operations vector tying together the zone_locked_mem_* handlers.  Judging
 * by the handler names the slots are, in order: action, usage, set and test
 * (NOTE(review): confirm against the rctl_ops_t definition).
 */
static rctl_ops_t zone_locked_mem_ops = {
        rcop_no_action,
        zone_locked_mem_usage,
        zone_locked_mem_set,
        zone_locked_mem_test
};
1654 1655  
1655 1656  /*ARGSUSED*/
1656 1657  static rctl_qty_t
1657 1658  zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1658 1659  {
1659 1660          rctl_qty_t q;
1660 1661          zone_t *z = p->p_zone;
1661 1662  
1662 1663          ASSERT(MUTEX_HELD(&p->p_lock));
1663 1664          mutex_enter(&z->zone_mem_lock);
1664 1665          q = z->zone_max_swap;
1665 1666          mutex_exit(&z->zone_mem_lock);
1666 1667          return (q);
1667 1668  }
1668 1669  
1669 1670  /*ARGSUSED*/
1670 1671  static int
1671 1672  zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1672 1673      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1673 1674  {
1674 1675          rctl_qty_t q;
1675 1676          zone_t *z;
1676 1677  
1677 1678          z = e->rcep_p.zone;
1678 1679          ASSERT(MUTEX_HELD(&p->p_lock));
1679 1680          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1680 1681          q = z->zone_max_swap;
1681 1682          if (q + incr > rcntl->rcv_value)
1682 1683                  return (1);
1683 1684          return (0);
1684 1685  }
1685 1686  
1686 1687  /*ARGSUSED*/
1687 1688  static int
1688 1689  zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1689 1690      rctl_qty_t nv)
1690 1691  {
1691 1692          ASSERT(MUTEX_HELD(&p->p_lock));
1692 1693          ASSERT(e->rcep_t == RCENTITY_ZONE);
1693 1694          if (e->rcep_p.zone == NULL)
1694 1695                  return (0);
1695 1696          e->rcep_p.zone->zone_max_swap_ctl = nv;
1696 1697          return (0);
1697 1698  }
1698 1699  
/*
 * Operations vector tying together the zone_max_swap_* handlers.  Judging by
 * the handler names the slots are, in order: action, usage, set and test
 * (NOTE(review): confirm against the rctl_ops_t definition).
 */
static rctl_ops_t zone_max_swap_ops = {
        rcop_no_action,
        zone_max_swap_usage,
        zone_max_swap_set,
        zone_max_swap_test
};
1705 1706  
1706 1707  /*ARGSUSED*/
1707 1708  static rctl_qty_t
1708 1709  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1709 1710  {
1710 1711          rctl_qty_t q;
1711 1712          zone_t *z = p->p_zone;
1712 1713  
1713 1714          ASSERT(MUTEX_HELD(&p->p_lock));
1714 1715          mutex_enter(&z->zone_rctl_lock);
1715 1716          q = z->zone_max_lofi;
1716 1717          mutex_exit(&z->zone_rctl_lock);
1717 1718          return (q);
1718 1719  }
1719 1720  
1720 1721  /*ARGSUSED*/
1721 1722  static int
1722 1723  zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1723 1724      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1724 1725  {
1725 1726          rctl_qty_t q;
1726 1727          zone_t *z;
1727 1728  
1728 1729          z = e->rcep_p.zone;
1729 1730          ASSERT(MUTEX_HELD(&p->p_lock));
1730 1731          ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1731 1732          q = z->zone_max_lofi;
1732 1733          if (q + incr > rcntl->rcv_value)
1733 1734                  return (1);
1734 1735          return (0);
1735 1736  }
1736 1737  
1737 1738  /*ARGSUSED*/
1738 1739  static int
1739 1740  zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1740 1741      rctl_qty_t nv)
1741 1742  {
1742 1743          ASSERT(MUTEX_HELD(&p->p_lock));
1743 1744          ASSERT(e->rcep_t == RCENTITY_ZONE);
1744 1745          if (e->rcep_p.zone == NULL)
1745 1746                  return (0);
1746 1747          e->rcep_p.zone->zone_max_lofi_ctl = nv;
1747 1748          return (0);
1748 1749  }
1749 1750  
/*
 * Operations vector tying together the zone_max_lofi_* handlers.  Judging by
 * the handler names the slots are, in order: action, usage, set and test
 * (NOTE(review): confirm against the rctl_ops_t definition).
 */
static rctl_ops_t zone_max_lofi_ops = {
        rcop_no_action,
        zone_max_lofi_usage,
        zone_max_lofi_set,
        zone_max_lofi_test
};
1756 1757  
1757 1758  /*
1758 1759   * Helper function to brand the zone with a unique ID.
1759 1760   */
1760 1761  static void
1761 1762  zone_uniqid(zone_t *zone)
1762 1763  {
1763 1764          static uint64_t uniqid = 0;
1764 1765  
1765 1766          ASSERT(MUTEX_HELD(&zonehash_lock));
1766 1767          zone->zone_uniqid = uniqid++;
1767 1768  }
1768 1769  
1769 1770  /*
1770 1771   * Returns a held pointer to the "kcred" for the specified zone.
1771 1772   */
1772 1773  struct cred *
1773 1774  zone_get_kcred(zoneid_t zoneid)
1774 1775  {
1775 1776          zone_t *zone;
1776 1777          cred_t *cr;
1777 1778  
1778 1779          if ((zone = zone_find_by_id(zoneid)) == NULL)
1779 1780                  return (NULL);
1780 1781          cr = zone->zone_kcred;
1781 1782          crhold(cr);
1782 1783          zone_rele(zone);
1783 1784          return (cr);
1784 1785  }
1785 1786  
1786 1787  static int
1787 1788  zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1788 1789  {
1789 1790          zone_t *zone = ksp->ks_private;
1790 1791          zone_kstat_t *zk = ksp->ks_data;
1791 1792  
1792 1793          if (rw == KSTAT_WRITE)
1793 1794                  return (EACCES);
1794 1795  
1795 1796          zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1796 1797          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1797 1798          return (0);
1798 1799  }
1799 1800  
1800 1801  static int
1801 1802  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1802 1803  {
1803 1804          zone_t *zone = ksp->ks_private;
1804 1805          zone_kstat_t *zk = ksp->ks_data;
1805 1806  
1806 1807          if (rw == KSTAT_WRITE)
1807 1808                  return (EACCES);
1808 1809  
1809 1810          zk->zk_usage.value.ui64 = zone->zone_nprocs;
1810 1811          zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1811 1812          return (0);
1812 1813  }
1813 1814  
1814 1815  static int
1815 1816  zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1816 1817  {
1817 1818          zone_t *zone = ksp->ks_private;
1818 1819          zone_kstat_t *zk = ksp->ks_data;
1819 1820  
1820 1821          if (rw == KSTAT_WRITE)
1821 1822                  return (EACCES);
1822 1823  
1823 1824          zk->zk_usage.value.ui64 = zone->zone_max_swap;
1824 1825          zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1825 1826          return (0);
1826 1827  }
1827 1828  
1828 1829  static kstat_t *
1829 1830  zone_kstat_create_common(zone_t *zone, char *name,
1830 1831      int (*updatefunc) (kstat_t *, int))
1831 1832  {
1832 1833          kstat_t *ksp;
1833 1834          zone_kstat_t *zk;
1834 1835  
1835 1836          ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1836 1837              sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1837 1838              KSTAT_FLAG_VIRTUAL);
1838 1839  
1839 1840          if (ksp == NULL)
1840 1841                  return (NULL);
1841 1842  
1842 1843          zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1843 1844          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1844 1845          kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1845 1846          kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1846 1847          kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1847 1848          kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1848 1849          ksp->ks_update = updatefunc;
1849 1850          ksp->ks_private = zone;
1850 1851          kstat_install(ksp);
1851 1852          return (ksp);
1852 1853  }
1853 1854  
1854 1855  
1855 1856  static int
1856 1857  zone_mcap_kstat_update(kstat_t *ksp, int rw)
1857 1858  {
1858 1859          zone_t *zone = ksp->ks_private;
1859 1860          zone_mcap_kstat_t *zmp = ksp->ks_data;
1860 1861  
1861 1862          if (rw == KSTAT_WRITE)
1862 1863                  return (EACCES);
1863 1864  
1864 1865          zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1865 1866          zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1866 1867          zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1867 1868          zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1868 1869          zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1869 1870  
1870 1871          return (0);
1871 1872  }
1872 1873  
1873 1874  static kstat_t *
1874 1875  zone_mcap_kstat_create(zone_t *zone)
1875 1876  {
1876 1877          kstat_t *ksp;
1877 1878          zone_mcap_kstat_t *zmp;
1878 1879  
1879 1880          if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1880 1881              zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1881 1882              sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1882 1883              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1883 1884                  return (NULL);
1884 1885  
1885 1886          if (zone->zone_id != GLOBAL_ZONEID)
1886 1887                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1887 1888  
1888 1889          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1889 1890          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1890 1891          ksp->ks_lock = &zone->zone_mcap_lock;
1891 1892          zone->zone_mcap_stats = zmp;
1892 1893  
1893 1894          /* The kstat "name" field is not large enough for a full zonename */
1894 1895          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1895 1896          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1896 1897          kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1897 1898          kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1898 1899          kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1899 1900          kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1900 1901          kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1901 1902              KSTAT_DATA_UINT64);
1902 1903  
1903 1904          ksp->ks_update = zone_mcap_kstat_update;
1904 1905          ksp->ks_private = zone;
1905 1906  
1906 1907          kstat_install(ksp);
1907 1908          return (ksp);
1908 1909  }
1909 1910  
1910 1911  static int
1911 1912  zone_misc_kstat_update(kstat_t *ksp, int rw)
1912 1913  {
1913 1914          zone_t *zone = ksp->ks_private;
1914 1915          zone_misc_kstat_t *zmp = ksp->ks_data;
1915 1916          hrtime_t hrtime;
1916 1917          uint64_t tmp;
1917 1918  
1918 1919          if (rw == KSTAT_WRITE)
1919 1920                  return (EACCES);
1920 1921  
1921 1922          tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME);
1922 1923          hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1923 1924          scalehrtime(&hrtime);
1924 1925          zmp->zm_stime.value.ui64 = hrtime;
1925 1926  
1926 1927          tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME);
1927 1928          hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1928 1929          scalehrtime(&hrtime);
1929 1930          zmp->zm_utime.value.ui64 = hrtime;
1930 1931  
1931 1932          tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME);
1932 1933          hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1933 1934          scalehrtime(&hrtime);
1934 1935          zmp->zm_wtime.value.ui64 = hrtime;
1935 1936  
1936 1937          zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1937 1938          zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1938 1939          zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1939 1940  
1940 1941          zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1941 1942          zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1942 1943          zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1943 1944          zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1944 1945  
1945 1946          zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1946 1947  
1947 1948          zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1948 1949          zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1949 1950  
1950 1951          return (0);
1951 1952  }
1952 1953  
1953 1954  static kstat_t *
1954 1955  zone_misc_kstat_create(zone_t *zone)
1955 1956  {
1956 1957          kstat_t *ksp;
1957 1958          zone_misc_kstat_t *zmp;
1958 1959  
1959 1960          if ((ksp = kstat_create_zone("zones", zone->zone_id,
1960 1961              zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1961 1962              sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1962 1963              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1963 1964                  return (NULL);
1964 1965  
1965 1966          if (zone->zone_id != GLOBAL_ZONEID)
1966 1967                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1967 1968  
1968 1969          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1969 1970          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1970 1971          ksp->ks_lock = &zone->zone_misc_lock;
1971 1972          zone->zone_misc_stats = zmp;
1972 1973  
1973 1974          /* The kstat "name" field is not large enough for a full zonename */
1974 1975          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1975 1976          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1976 1977          kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1977 1978          kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1978 1979          kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1979 1980          kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1980 1981          kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1981 1982          kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1982 1983              KSTAT_DATA_UINT32);
1983 1984          kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1984 1985          kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1985 1986              KSTAT_DATA_UINT32);
1986 1987          kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1987 1988          kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1988 1989          kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1989 1990              KSTAT_DATA_UINT32);
1990 1991          kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1991 1992          kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1992 1993  
1993 1994          ksp->ks_update = zone_misc_kstat_update;
1994 1995          ksp->ks_private = zone;
1995 1996  
1996 1997          kstat_install(ksp);
1997 1998          return (ksp);
1998 1999  }
1999 2000  
2000 2001  static void
2001 2002  zone_kstat_create(zone_t *zone)
2002 2003  {
2003 2004          zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2004 2005              "lockedmem", zone_lockedmem_kstat_update);
2005 2006          zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2006 2007              "swapresv", zone_swapresv_kstat_update);
2007 2008          zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2008 2009              "nprocs", zone_nprocs_kstat_update);
2009 2010  
2010 2011          if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2011 2012                  zone->zone_mcap_stats = kmem_zalloc(
2012 2013                      sizeof (zone_mcap_kstat_t), KM_SLEEP);
2013 2014          }
2014 2015  
2015 2016          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2016 2017                  zone->zone_misc_stats = kmem_zalloc(
2017 2018                      sizeof (zone_misc_kstat_t), KM_SLEEP);
2018 2019          }
2019 2020  }
2020 2021  
2021 2022  static void
2022 2023  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2023 2024  {
2024 2025          void *data;
2025 2026  
2026 2027          if (*pkstat != NULL) {
2027 2028                  data = (*pkstat)->ks_data;
2028 2029                  kstat_delete(*pkstat);
2029 2030                  kmem_free(data, datasz);
2030 2031                  *pkstat = NULL;
2031 2032          }
2032 2033  }
2033 2034  
2034 2035  static void
2035 2036  zone_kstat_delete(zone_t *zone)
2036 2037  {
2037 2038          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2038 2039              sizeof (zone_kstat_t));
2039 2040          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2040 2041              sizeof (zone_kstat_t));
2041 2042          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2042 2043              sizeof (zone_kstat_t));
2043 2044          zone_kstat_delete_common(&zone->zone_mcap_ksp,
2044 2045              sizeof (zone_mcap_kstat_t));
2045 2046          zone_kstat_delete_common(&zone->zone_misc_ksp,
2046 2047              sizeof (zone_misc_kstat_t));
2047 2048  }
2048 2049  
2049 2050  /*
2050 2051   * Called very early on in boot to initialize the ZSD list so that
2051 2052   * zone_key_create() can be called before zone_init().  It also initializes
2052 2053   * portions of zone0 which may be used before zone_init() is called.  The
2053 2054   * variable "global_zone" will be set when zone0 is fully initialized by
2054 2055   * zone_init().
2055 2056   */
2056 2057  void
2057 2058  zone_zsd_init(void)
2058 2059  {
2059 2060          mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2060 2061          mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2061 2062          list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2062 2063              offsetof(struct zsd_entry, zsd_linkage));
2063 2064          list_create(&zone_active, sizeof (zone_t),
2064 2065              offsetof(zone_t, zone_linkage));
2065 2066          list_create(&zone_deathrow, sizeof (zone_t),
2066 2067              offsetof(zone_t, zone_linkage));
2067 2068  
2068 2069          mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2069 2070          mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2070 2071          mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2071 2072          zone0.zone_shares = 1;
2072 2073          zone0.zone_nlwps = 0;
2073 2074          zone0.zone_nlwps_ctl = INT_MAX;
2074 2075          zone0.zone_nprocs = 0;
2075 2076          zone0.zone_nprocs_ctl = INT_MAX;
2076 2077          zone0.zone_locked_mem = 0;
2077 2078          zone0.zone_locked_mem_ctl = UINT64_MAX;
2078 2079          ASSERT(zone0.zone_max_swap == 0);
2079 2080          zone0.zone_max_swap_ctl = UINT64_MAX;
2080 2081          zone0.zone_max_lofi = 0;
2081 2082          zone0.zone_max_lofi_ctl = UINT64_MAX;
2082 2083          zone0.zone_shmmax = 0;
2083 2084          zone0.zone_ipc.ipcq_shmmni = 0;
2084 2085          zone0.zone_ipc.ipcq_semmni = 0;
2085 2086          zone0.zone_ipc.ipcq_msgmni = 0;
2086 2087          zone0.zone_name = GLOBAL_ZONENAME;
2087 2088          zone0.zone_nodename = utsname.nodename;
2088 2089          zone0.zone_domain = srpc_domain;
2089 2090          zone0.zone_hostid = HW_INVALID_HOSTID;
2090 2091          zone0.zone_fs_allowed = NULL;
2091 2092          psecflags_default(&zone0.zone_secflags);
2092 2093          zone0.zone_ref = 1;
2093 2094          zone0.zone_id = GLOBAL_ZONEID;
2094 2095          zone0.zone_status = ZONE_IS_RUNNING;
2095 2096          zone0.zone_rootpath = "/";
2096 2097          zone0.zone_rootpathlen = 2;
2097 2098          zone0.zone_psetid = ZONE_PS_INVAL;
2098 2099          zone0.zone_ncpus = 0;
2099 2100          zone0.zone_ncpus_online = 0;
2100 2101          zone0.zone_proc_initpid = 1;
2101 2102          zone0.zone_initname = initname;
2102 2103          zone0.zone_lockedmem_kstat = NULL;
2103 2104          zone0.zone_swapresv_kstat = NULL;
2104 2105          zone0.zone_nprocs_kstat = NULL;
2105 2106  
2106 2107          list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2107 2108              offsetof(zone_ref_t, zref_linkage));
2108 2109          list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2109 2110              offsetof(struct zsd_entry, zsd_linkage));
2110 2111          list_insert_head(&zone_active, &zone0);
2111 2112  
2112 2113          /*
2113 2114           * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2114 2115           * to anything meaningful.  It is assigned to be 'rootdir' in
2115 2116           * vfs_mountroot().
2116 2117           */
2117 2118          zone0.zone_rootvp = NULL;
2118 2119          zone0.zone_vfslist = NULL;
2119 2120          zone0.zone_bootargs = initargs;
2120 2121          zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2121 2122          /*
2122 2123           * The global zone has all privileges
2123 2124           */
2124 2125          priv_fillset(zone0.zone_privset);
2125 2126          /*
2126 2127           * Add p0 to the global zone
2127 2128           */
2128 2129          zone0.zone_zsched = &p0;
2129 2130          p0.p_zone = &zone0;
2130 2131  }
2131 2132  
2132 2133  /*
2133 2134   * Compute a hash value based on the contents of the label and the DOI.  The
2134 2135   * hash algorithm is somewhat arbitrary, but is based on the observation that
2135 2136   * humans will likely pick labels that differ by amounts that work out to be
2136 2137   * multiples of the number of hash chains, and thus stirring in some primes
2137 2138   * should help.
2138 2139   */
2139 2140  static uint_t
2140 2141  hash_bylabel(void *hdata, mod_hash_key_t key)
2141 2142  {
2142 2143          const ts_label_t *lab = (ts_label_t *)key;
2143 2144          const uint32_t *up, *ue;
2144 2145          uint_t hash;
2145 2146          int i;
2146 2147  
2147 2148          _NOTE(ARGUNUSED(hdata));
2148 2149  
2149 2150          hash = lab->tsl_doi + (lab->tsl_doi << 1);
2150 2151          /* we depend on alignment of label, but not representation */
2151 2152          up = (const uint32_t *)&lab->tsl_label;
2152 2153          ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2153 2154          i = 1;
2154 2155          while (up < ue) {
2155 2156                  /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2156 2157                  hash += *up + (*up << ((i % 16) + 1));
2157 2158                  up++;
2158 2159                  i++;
2159 2160          }
2160 2161          return (hash);
2161 2162  }
2162 2163  
2163 2164  /*
2164 2165   * All that mod_hash cares about here is zero (equal) versus non-zero (not
2165 2166   * equal).  This may need to be changed if less than / greater than is ever
2166 2167   * needed.
2167 2168   */
2168 2169  static int
2169 2170  hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2170 2171  {
2171 2172          ts_label_t *lab1 = (ts_label_t *)key1;
2172 2173          ts_label_t *lab2 = (ts_label_t *)key2;
2173 2174  
2174 2175          return (label_equal(lab1, lab2) ? 0 : 1);
2175 2176  }
2176 2177  
2177 2178  /*
2178 2179   * Called by main() to initialize the zones framework.
            *
            * In order, this routine: creates the zone ID space, registers the
            * zone.* resource controls, builds the global zone's rctl set and
            * seeds its lwp/process/task counts from p0, creates the global
            * zone's kstats, label, locks and per-CPU usage array, creates the
            * zone lookup hashes and enters zone0 into them, publishes
            * global_zone, and finally binds the zone status event channel.
            * A failure to bind the event channel panics the system.
2179 2180   */
2180 2181  void
2181 2182  zone_init(void)
2182 2183  {
2183 2184          rctl_dict_entry_t *rde;
2184 2185          rctl_val_t *dval;
2185 2186          rctl_set_t *set;
2186 2187          rctl_alloc_gp_t *gp;
2187 2188          rctl_entity_p_t e;
2188 2189          int res;
2189 2190  
2190 2191          ASSERT(curproc == &p0);
2191 2192  
2192 2193          /*
2193 2194           * Create ID space for zone IDs.  ID 0 is reserved for the
2194 2195           * global zone.
2195 2196           */
2196 2197          zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2197 2198  
2198 2199          /*
2199 2200           * Initialize generic zone resource controls, if any.
2200 2201           */
2201 2202          rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2202 2203              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2203 2204              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2204 2205              FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2205 2206  
2206 2207          rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2207 2208              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2208 2209              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2209 2210              RCTL_GLOBAL_INFINITE,
2210 2211              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2211 2212  
2212 2213          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2213 2214              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2214 2215              INT_MAX, INT_MAX, &zone_lwps_ops);
2215 2216  
2216 2217          rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2217 2218              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2218 2219              INT_MAX, INT_MAX, &zone_procs_ops);
2219 2220  
2220 2221          /*
2221 2222           * System V IPC resource controls
2222 2223           */
2223 2224          rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2224 2225              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2225 2226              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2226 2227  
2227 2228          rc_zone_semmni = rctl_register("zone.max-sem-ids",
2228 2229              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2229 2230              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2230 2231  
2231 2232          rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2232 2233              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2233 2234              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2234 2235  
2235 2236          rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2236 2237              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2237 2238              RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2238 2239  
2239 2240          /*
2240 2241           * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2241 2242           * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2242 2243           */
2243 2244          dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2244 2245          bzero(dval, sizeof (rctl_val_t));
2245 2246          dval->rcv_value = 1;
2246 2247          dval->rcv_privilege = RCPRIV_PRIVILEGED;
2247 2248          dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2248 2249          dval->rcv_action_recip_pid = -1;
2249 2250  
                   /* The entry looked up here was registered at the top of this function. */
2250 2251          rde = rctl_dict_lookup("zone.cpu-shares");
2251 2252          (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2252 2253  
2253 2254          rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2254 2255              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2255 2256              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2256 2257              &zone_locked_mem_ops);
2257 2258  
2258 2259          rc_zone_max_swap = rctl_register("zone.max-swap",
2259 2260              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2260 2261              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2261 2262              &zone_max_swap_ops);
2262 2263  
2263 2264          rc_zone_max_lofi = rctl_register("zone.max-lofi",
2264 2265              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2265 2266              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2266 2267              &zone_max_lofi_ops);
2267 2268  
2268 2269          /*
2269 2270           * Initialize the ``global zone''.
2270 2271           */
2271 2272          set = rctl_set_create();
2272 2273          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2273 2274          mutex_enter(&p0.p_lock);
2274 2275          e.rcep_p.zone = &zone0;
2275 2276          e.rcep_t = RCENTITY_ZONE;
2276 2277          zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2277 2278              gp);
2278 2279  
                   /*
                    * Seed the global zone's accounting from p0: its current lwp
                    * count, one process and one task.
                    */
2279 2280          zone0.zone_nlwps = p0.p_lwpcnt;
2280 2281          zone0.zone_nprocs = 1;
2281 2282          zone0.zone_ntasks = 1;
2282 2283          mutex_exit(&p0.p_lock);
2283 2284          zone0.zone_restart_init = B_TRUE;
2284 2285          zone0.zone_brand = &native_brand;
2285 2286          rctl_prealloc_destroy(gp);
2286 2287          /*
2287 2288           * pool_default hasn't been initialized yet, so we let pool_init()
2288 2289           * take care of making sure the global zone is in the default pool.
2289 2290           */
2290 2291  
2291 2292          /*
2292 2293           * Initialize global zone kstats
2293 2294           */
2294 2295          zone_kstat_create(&zone0);
2295 2296  
2296 2297          /*
2297 2298           * Initialize zone label.
2298 2299           * mlp are initialized when tnzonecfg is loaded.
2299 2300           */
2300 2301          zone0.zone_slabel = l_admin_low;
2301 2302          rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2302 2303          label_hold(l_admin_low);
2303 2304  
2304 2305          /*
2305 2306           * Initialise the lock for the database structure used by mntfs.
2306 2307           */
2307 2308          rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2308 2309  
2309 2310          zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
2310 2311  
                   /*
                    * With zonehash_lock held: assign the global zone its unique ID,
                    * create the lookup hashes, and enter zone0 into each of them.
                    */
2311 2312          mutex_enter(&zonehash_lock);
2312 2313          zone_uniqid(&zone0);
2313 2314          ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2314 2315  
2315 2316          zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2316 2317              mod_hash_null_valdtor);
2317 2318          zonehashbyname = mod_hash_create_strhash("zone_by_name",
2318 2319              zone_hash_size, mod_hash_null_valdtor);
2319 2320          /*
2320 2321           * maintain zonehashbylabel only for labeled systems
2321 2322           */
2322 2323          if (is_system_labeled())
2323 2324                  zonehashbylabel = mod_hash_create_extended("zone_by_label",
2324 2325                      zone_hash_size, mod_hash_null_keydtor,
2325 2326                      mod_hash_null_valdtor, hash_bylabel, NULL,
2326 2327                      hash_labelkey_cmp, KM_SLEEP);
2327 2328          zonecount = 1;
2328 2329  
2329 2330          (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2330 2331              (mod_hash_val_t)&zone0);
2331 2332          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2332 2333              (mod_hash_val_t)&zone0);
2333 2334          if (is_system_labeled()) {
2334 2335                  zone0.zone_flags |= ZF_HASHED_LABEL;
2335 2336                  (void) mod_hash_insert(zonehashbylabel,
2336 2337                      (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2337 2338          }
2338 2339          mutex_exit(&zonehash_lock);
2339 2340  
2340 2341          /*
2341 2342           * We avoid setting zone_kcred until now, since kcred is initialized
2342 2343           * sometime after zone_zsd_init() and before zone_init().
2343 2344           */
2344 2345          zone0.zone_kcred = kcred;
2345 2346          /*
2346 2347           * The global zone is fully initialized (except for zone_rootvp which
2347 2348           * will be set when the root filesystem is mounted).
2348 2349           */
2349 2350          global_zone = &zone0;
2350 2351  
2351 2352          /*
2352 2353           * Setup an event channel to send zone status change notifications on
2353 2354           */
2354 2355          res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2355 2356              EVCH_CREAT);
2356 2357  
                   /* A non-zero result means the bind failed; that is fatal here. */
2357 2358          if (res)
2358 2359                  panic("Sysevent_evc_bind failed during zone setup.\n");
2359 2360  
2360 2361  }
2361 2362  
2362 2363  static void
2363 2364  zone_free(zone_t *zone)
2364 2365  {
                   /*
                    * Release every resource still attached to a non-global zone and
                    * then free the zone_t itself.  The assertions below spell out the
                    * expected state on entry: no tasks, lwps, processes or cred
                    * references remain, zone_kcred has been cleared, the tracked
                    * reference list is empty, and the zone is either dead or was
                    * never initialized.
                    */
2365 2366          ASSERT(zone != global_zone);
2366 2367          ASSERT(zone->zone_ntasks == 0);
2367 2368          ASSERT(zone->zone_nlwps == 0);
2368 2369          ASSERT(zone->zone_nprocs == 0);
2369 2370          ASSERT(zone->zone_cred_ref == 0);
2370 2371          ASSERT(zone->zone_kcred == NULL);
2371 2372          ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2372 2373              zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2373 2374          ASSERT(list_is_empty(&zone->zone_ref_list));
2374 2375  
2375 2376          /*
2376 2377           * Remove any zone caps.
2377 2378           */
2378 2379          cpucaps_zone_remove(zone);
2379 2380  
2380 2381          ASSERT(zone->zone_cpucap == NULL);
2381 2382  
2382 2383          /* remove from deathrow list */
2383 2384          if (zone_status_get(zone) == ZONE_IS_DEAD) {
2384 2385                  ASSERT(zone->zone_ref == 0);
2385 2386                  mutex_enter(&zone_deathrow_lock);
2386 2387                  list_remove(&zone_deathrow, zone);
2387 2388                  mutex_exit(&zone_deathrow_lock);
2388 2389          }
2389 2390  
                   /* Tear down the per-zone lists, ZSD entries and dataset records. */
2390 2391          list_destroy(&zone->zone_ref_list);
2391 2392          zone_free_zsd(zone);
2392 2393          zone_free_datasets(zone);
2393 2394          list_destroy(&zone->zone_dl_list);
2394 2395  
2395 2396          cpu_uarray_free(zone->zone_ustate);
2396 2397  
                   /*
                    * Each of the following fields is optional; release it only if it
                    * was actually set.
                    */
2397 2398          if (zone->zone_rootvp != NULL)
2398 2399                  VN_RELE(zone->zone_rootvp);
2399 2400          if (zone->zone_rootpath)
2400 2401                  kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2401 2402          if (zone->zone_name != NULL)
2402 2403                  kmem_free(zone->zone_name, ZONENAME_MAX);
2403 2404          if (zone->zone_slabel != NULL)
2404 2405                  label_rele(zone->zone_slabel);
2405 2406          if (zone->zone_nodename != NULL)
2406 2407                  kmem_free(zone->zone_nodename, _SYS_NMLN);
2407 2408          if (zone->zone_domain != NULL)
2408 2409                  kmem_free(zone->zone_domain, _SYS_NMLN);
2409 2410          if (zone->zone_privset != NULL)
2410 2411                  kmem_free(zone->zone_privset, sizeof (priv_set_t));
2411 2412          if (zone->zone_rctls != NULL)
2412 2413                  rctl_set_free(zone->zone_rctls);
2413 2414          if (zone->zone_bootargs != NULL)
2414 2415                  strfree(zone->zone_bootargs);
2415 2416          if (zone->zone_initname != NULL)
2416 2417                  strfree(zone->zone_initname);
2417 2418          if (zone->zone_fs_allowed != NULL)
2418 2419                  strfree(zone->zone_fs_allowed);
2419 2420          if (zone->zone_pfexecd != NULL)
2420 2421                  klpd_freelist(&zone->zone_pfexecd);
                   /*
                    * Return the zone ID to zoneid_space, destroy the zone's
                    * synchronization objects, and finally free the zone_t.
                    */
2421 2422          id_free(zoneid_space, zone->zone_id);
2422 2423          mutex_destroy(&zone->zone_lock);
2423 2424          cv_destroy(&zone->zone_cv);
2424 2425          rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2425 2426          rw_destroy(&zone->zone_mntfs_db_lock);
2426 2427          kmem_free(zone, sizeof (zone_t));
2427 2428  }
2428 2429  
2429 2430  /*
2430 2431   * See block comment at the top of this file for information about zone
2431 2432   * status values.
2432 2433   */
2433 2434  /*
2434 2435   * Convenience function for setting zone status.
2435 2436   */
2436 2437  static void
2437 2438  zone_status_set(zone_t *zone, zone_status_t status)
2438 2439  {
2439 2440  
2440 2441          nvlist_t *nvl = NULL;
2441 2442          ASSERT(MUTEX_HELD(&zone_status_lock));
2442 2443          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2443 2444              status >= zone_status_get(zone));
2444 2445  
2445 2446          if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2446 2447              nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2447 2448              nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2448 2449              zone_status_table[status]) ||
2449 2450              nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2450 2451              zone_status_table[zone->zone_status]) ||
2451 2452              nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2452 2453              nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2453 2454              sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2454 2455              ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2455 2456  #ifdef DEBUG
2456 2457                  (void) printf(
2457 2458                      "Failed to allocate and send zone state change event.\n");
2458 2459  #endif
2459 2460          }
2460 2461          nvlist_free(nvl);
2461 2462  
2462 2463          zone->zone_status = status;
2463 2464  
2464 2465          cv_broadcast(&zone->zone_cv);
2465 2466  }
2466 2467  
2467 2468  /*
2468 2469   * Public function to retrieve the zone status.  The zone status may
2469 2470   * change after it is retrieved.
2470 2471   */
2471 2472  zone_status_t
2472 2473  zone_status_get(zone_t *zone)
2473 2474  {
2474 2475          return (zone->zone_status);
2475 2476  }
2476 2477  
2477 2478  static int
2478 2479  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2479 2480  {
2480 2481          char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2481 2482          int err = 0;
2482 2483  
2483 2484          ASSERT(zone != global_zone);
2484 2485          if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2485 2486                  goto done;      /* EFAULT or ENAMETOOLONG */
2486 2487  
2487 2488          if (zone->zone_bootargs != NULL)
2488 2489                  strfree(zone->zone_bootargs);
2489 2490  
2490 2491          zone->zone_bootargs = strdup(buf);
2491 2492  
2492 2493  done:
2493 2494          kmem_free(buf, BOOTARGS_MAX);
2494 2495          return (err);
2495 2496  }
2496 2497  
2497 2498  static int
2498 2499  zone_set_brand(zone_t *zone, const char *brand)
2499 2500  {
2500 2501          struct brand_attr *attrp;
2501 2502          brand_t *bp;
2502 2503  
2503 2504          attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2504 2505          if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2505 2506                  kmem_free(attrp, sizeof (struct brand_attr));
2506 2507                  return (EFAULT);
2507 2508          }
2508 2509  
2509 2510          bp = brand_register_zone(attrp);
2510 2511          kmem_free(attrp, sizeof (struct brand_attr));
2511 2512          if (bp == NULL)
2512 2513                  return (EINVAL);
2513 2514  
2514 2515          /*
2515 2516           * This is the only place where a zone can change it's brand.
2516 2517           * We already need to hold zone_status_lock to check the zone
2517 2518           * status, so we'll just use that lock to serialize zone
2518 2519           * branding requests as well.
2519 2520           */
2520 2521          mutex_enter(&zone_status_lock);
2521 2522  
2522 2523          /* Re-Branding is not allowed and the zone can't be booted yet */
2523 2524          if ((ZONE_IS_BRANDED(zone)) ||
2524 2525              (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2525 2526                  mutex_exit(&zone_status_lock);
2526 2527                  brand_unregister_zone(bp);
2527 2528                  return (EINVAL);
2528 2529          }
2529 2530  
2530 2531          /* set up the brand specific data */
2531 2532          zone->zone_brand = bp;
2532 2533          ZBROP(zone)->b_init_brand_data(zone);
2533 2534  
2534 2535          mutex_exit(&zone_status_lock);
2535 2536          return (0);
2536 2537  }
2537 2538  
2538 2539  static int
2539 2540  zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2540 2541  {
2541 2542          int err = 0;
2542 2543          psecflags_t psf;
2543 2544  
2544 2545          ASSERT(zone != global_zone);
2545 2546  
2546 2547          if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
2547 2548                  return (err);
2548 2549  
2549 2550          if (zone_status_get(zone) > ZONE_IS_READY)
2550 2551                  return (EINVAL);
2551 2552  
2552 2553          if (!psecflags_validate(&psf))
2553 2554                  return (EINVAL);
2554 2555  
2555 2556          (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2556 2557  
2557 2558          /* Set security flags on the zone's zsched */
2558 2559          (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2559 2560              sizeof (zone->zone_zsched->p_secflags));
2560 2561  
2561 2562          return (0);
2562 2563  }
2563 2564  
2564 2565  static int
2565 2566  zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2566 2567  {
2567 2568          char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2568 2569          int err = 0;
2569 2570  
2570 2571          ASSERT(zone != global_zone);
2571 2572          if ((err = copyinstr(zone_fs_allowed, buf,
2572 2573              ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2573 2574                  goto done;
2574 2575  
2575 2576          if (zone->zone_fs_allowed != NULL)
2576 2577                  strfree(zone->zone_fs_allowed);
2577 2578  
2578 2579          zone->zone_fs_allowed = strdup(buf);
2579 2580  
2580 2581  done:
2581 2582          kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2582 2583          return (err);
2583 2584  }
2584 2585  
2585 2586  static int
2586 2587  zone_set_initname(zone_t *zone, const char *zone_initname)
2587 2588  {
2588 2589          char initname[INITNAME_SZ];
2589 2590          size_t len;
2590 2591          int err = 0;
2591 2592  
2592 2593          ASSERT(zone != global_zone);
2593 2594          if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2594 2595                  return (err);   /* EFAULT or ENAMETOOLONG */
2595 2596  
2596 2597          if (zone->zone_initname != NULL)
2597 2598                  strfree(zone->zone_initname);
2598 2599  
2599 2600          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2600 2601          (void) strcpy(zone->zone_initname, initname);
2601 2602          return (0);
2602 2603  }
2603 2604  
2604 2605  static int
2605 2606  zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2606 2607  {
2607 2608          uint64_t mcap;
2608 2609          int err = 0;
2609 2610  
2610 2611          if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2611 2612                  zone->zone_phys_mcap = mcap;
2612 2613  
2613 2614          return (err);
2614 2615  }
2615 2616  
2616 2617  static int
2617 2618  zone_set_sched_class(zone_t *zone, const char *new_class)
2618 2619  {
2619 2620          char sched_class[PC_CLNMSZ];
2620 2621          id_t classid;
2621 2622          int err;
2622 2623  
2623 2624          ASSERT(zone != global_zone);
2624 2625          if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2625 2626                  return (err);   /* EFAULT or ENAMETOOLONG */
2626 2627  
2627 2628          if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2628 2629                  return (set_errno(EINVAL));
2629 2630          zone->zone_defaultcid = classid;
2630 2631          ASSERT(zone->zone_defaultcid > 0 &&
2631 2632              zone->zone_defaultcid < loaded_classes);
2632 2633  
2633 2634          return (0);
2634 2635  }
2635 2636  
2636 2637  /*
2637 2638   * Block indefinitely waiting for (zone_status >= status)
2638 2639   */
2639 2640  void
2640 2641  zone_status_wait(zone_t *zone, zone_status_t status)
2641 2642  {
2642 2643          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2643 2644  
2644 2645          mutex_enter(&zone_status_lock);
2645 2646          while (zone->zone_status < status) {
2646 2647                  cv_wait(&zone->zone_cv, &zone_status_lock);
2647 2648          }
2648 2649          mutex_exit(&zone_status_lock);
2649 2650  }
2650 2651  
2651 2652  /*
2652 2653   * Private CPR-safe version of zone_status_wait().
2653 2654   */
2654 2655  static void
2655 2656  zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2656 2657  {
2657 2658          callb_cpr_t cprinfo;
2658 2659  
2659 2660          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2660 2661  
2661 2662          CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2662 2663              str);
2663 2664          mutex_enter(&zone_status_lock);
2664 2665          while (zone->zone_status < status) {
2665 2666                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
2666 2667                  cv_wait(&zone->zone_cv, &zone_status_lock);
2667 2668                  CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2668 2669          }
2669 2670          /*
2670 2671           * zone_status_lock is implicitly released by the following.
2671 2672           */
2672 2673          CALLB_CPR_EXIT(&cprinfo);
2673 2674  }
2674 2675  
2675 2676  /*
2676 2677   * Block until zone enters requested state or signal is received.  Return (0)
2677 2678   * if signaled, non-zero otherwise.
2678 2679   */
2679 2680  int
2680 2681  zone_status_wait_sig(zone_t *zone, zone_status_t status)
2681 2682  {
2682 2683          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2683 2684  
2684 2685          mutex_enter(&zone_status_lock);
2685 2686          while (zone->zone_status < status) {
2686 2687                  if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2687 2688                          mutex_exit(&zone_status_lock);
2688 2689                          return (0);
2689 2690                  }
2690 2691          }
2691 2692          mutex_exit(&zone_status_lock);
2692 2693          return (1);
2693 2694  }
2694 2695  
2695 2696  /*
2696 2697   * Block until the zone enters the requested state or the timeout expires,
2697 2698   * whichever happens first.  Return (-1) if operation timed out, time remaining
2698 2699   * otherwise.
2699 2700   */
2700 2701  clock_t
2701 2702  zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2702 2703  {
2703 2704          clock_t timeleft = 0;
2704 2705  
2705 2706          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2706 2707  
2707 2708          mutex_enter(&zone_status_lock);
2708 2709          while (zone->zone_status < status && timeleft != -1) {
2709 2710                  timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2710 2711          }
2711 2712          mutex_exit(&zone_status_lock);
2712 2713          return (timeleft);
2713 2714  }
2714 2715  
2715 2716  /*
2716 2717   * Block until the zone enters the requested state, the current process is
2717 2718   * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2718 2719   * operation timed out, 0 if signaled, time remaining otherwise.
2719 2720   */
2720 2721  clock_t
2721 2722  zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2722 2723  {
2723 2724          clock_t timeleft = tim - ddi_get_lbolt();
2724 2725  
2725 2726          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2726 2727  
2727 2728          mutex_enter(&zone_status_lock);
2728 2729          while (zone->zone_status < status) {
2729 2730                  timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2730 2731                      tim);
2731 2732                  if (timeleft <= 0)
2732 2733                          break;
2733 2734          }
2734 2735          mutex_exit(&zone_status_lock);
2735 2736          return (timeleft);
2736 2737  }
2737 2738  
2738 2739  /*
2739 2740   * Zones have two reference counts: one for references from credential
2740 2741   * structures (zone_cred_ref), and one (zone_ref) for everything else.
2741 2742   * This is so we can allow a zone to be rebooted while there are still
2742 2743   * outstanding cred references, since certain drivers cache dblks (which
2743 2744   * implicitly results in cached creds).  We wait for zone_ref to drop to
2744 2745   * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2745 2746   * later freed when the zone_cred_ref drops to 0, though nothing other
2746 2747   * than the zone id and privilege set should be accessed once the zone
2747 2748   * is "dead".
2748 2749   *
2749 2750   * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2750 2751   * to force halt/reboot to block waiting for the zone_cred_ref to drop
2751 2752   * to 0.  This can be useful to flush out other sources of cached creds
2752 2753   * that may be less innocuous than the driver case.
2753 2754   *
2754 2755   * Zones also provide a tracked reference counting mechanism in which zone
2755 2756   * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2756 2757   * debuggers determine the sources of leaked zone references.  See
2757 2758   * zone_hold_ref() and zone_rele_ref() below for more information.
2758 2759   */
2759 2760  
2760 2761  int zone_wait_for_cred = 0;     /* debug flag; consulted by ZONE_IS_UNREF() */
2761 2762  
2762 2763  static void
2763 2764  zone_hold_locked(zone_t *z)
2764 2765  {
2765 2766          ASSERT(MUTEX_HELD(&z->zone_lock));
2766 2767          z->zone_ref++;
2767 2768          ASSERT(z->zone_ref != 0);
2768 2769  }
2769 2770  
2770 2771  /*
2771 2772   * Increment the specified zone's reference count.  The zone's zone_t structure
2772 2773   * will not be freed as long as the zone's reference count is nonzero.
2773 2774   * Decrement the zone's reference count via zone_rele().
2774 2775   *
2775 2776   * NOTE: This function should only be used to hold zones for short periods of
2776 2777   * time.  Use zone_hold_ref() if the zone must be held for a long time.
2777 2778   */
2778 2779  void
2779 2780  zone_hold(zone_t *z)
2780 2781  {
2781 2782          mutex_enter(&z->zone_lock);
2782 2783          zone_hold_locked(z);
2783 2784          mutex_exit(&z->zone_lock);
2784 2785  }
2785 2786  
/*
 * A zone is ready to be destroyed once its non-cred reference count has
 * dropped to 1 and, in addition, either we are not waiting for cred
 * references or the cred reference count is 0.
 */
#define	ZONE_IS_UNREF(zone)						\
	(((zone)->zone_ref == 1) &&					\
	((zone_wait_for_cred == 0) || ((zone)->zone_cred_ref == 0)))
2793 2794  
2794 2795  /*
2795 2796   * Common zone reference release function invoked by zone_rele() and
2796 2797   * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2797 2798   * zone's subsystem-specific reference counters are not affected by the
2798 2799   * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2799 2800   * removed from the specified zone's reference list.  ref must be non-NULL iff
2800 2801   * subsys is not ZONE_REF_NUM_SUBSYS.
2801 2802   */
2802 2803  static void
2803 2804  zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2804 2805  {
2805 2806          boolean_t wakeup;
2806 2807  
2807 2808          mutex_enter(&z->zone_lock);
2808 2809          ASSERT(z->zone_ref != 0);
2809 2810          z->zone_ref--;
2810 2811          if (subsys != ZONE_REF_NUM_SUBSYS) {
2811 2812                  ASSERT(z->zone_subsys_ref[subsys] != 0);
2812 2813                  z->zone_subsys_ref[subsys]--;
2813 2814                  list_remove(&z->zone_ref_list, ref);
2814 2815          }
2815 2816          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2816 2817                  /* no more refs, free the structure */
2817 2818                  mutex_exit(&z->zone_lock);
2818 2819                  zone_free(z);
2819 2820                  return;
2820 2821          }
2821 2822          /* signal zone_destroy so the zone can finish halting */
2822 2823          wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2823 2824          mutex_exit(&z->zone_lock);
2824 2825  
2825 2826          if (wakeup) {
2826 2827                  /*
2827 2828                   * Grabbing zonehash_lock here effectively synchronizes with
2828 2829                   * zone_destroy() to avoid missed signals.
2829 2830                   */
2830 2831                  mutex_enter(&zonehash_lock);
2831 2832                  cv_broadcast(&zone_destroy_cv);
2832 2833                  mutex_exit(&zonehash_lock);
2833 2834          }
2834 2835  }
2835 2836  
2836 2837  /*
2837 2838   * Decrement the specified zone's reference count.  The specified zone will
2838 2839   * cease to exist after this function returns if the reference count drops to
2839 2840   * zero.  This function should be paired with zone_hold().
2840 2841   */
2841 2842  void
2842 2843  zone_rele(zone_t *z)
2843 2844  {
2844 2845          zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2845 2846  }
2846 2847  
2847 2848  /*
2848 2849   * Initialize a zone reference structure.  This function must be invoked for
2849 2850   * a reference structure before the structure is passed to zone_hold_ref().
2850 2851   */
2851 2852  void
2852 2853  zone_init_ref(zone_ref_t *ref)
2853 2854  {
2854 2855          ref->zref_zone = NULL;
2855 2856          list_link_init(&ref->zref_linkage);
2856 2857  }
2857 2858  
2858 2859  /*
2859 2860   * Acquire a reference to zone z.  The caller must specify the
2860 2861   * zone_ref_subsys_t constant associated with its subsystem.  The specified
2861 2862   * zone_ref_t structure will represent a reference to the specified zone.  Use
2862 2863   * zone_rele_ref() to release the reference.
2863 2864   *
2864 2865   * The referenced zone_t structure will not be freed as long as the zone_t's
2865 2866   * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2866 2867   * references.
2867 2868   *
2868 2869   * NOTE: The zone_ref_t structure must be initialized before it is used.
2869 2870   * See zone_init_ref() above.
2870 2871   */
2871 2872  void
2872 2873  zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2873 2874  {
2874 2875          ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2875 2876  
2876 2877          /*
2877 2878           * Prevent consumers from reusing a reference structure before
2878 2879           * releasing it.
2879 2880           */
2880 2881          VERIFY(ref->zref_zone == NULL);
2881 2882  
2882 2883          ref->zref_zone = z;
2883 2884          mutex_enter(&z->zone_lock);
2884 2885          zone_hold_locked(z);
2885 2886          z->zone_subsys_ref[subsys]++;
2886 2887          ASSERT(z->zone_subsys_ref[subsys] != 0);
2887 2888          list_insert_head(&z->zone_ref_list, ref);
2888 2889          mutex_exit(&z->zone_lock);
2889 2890  }
2890 2891  
2891 2892  /*
2892 2893   * Release the zone reference represented by the specified zone_ref_t.
2893 2894   * The reference is invalid after it's released; however, the zone_ref_t
2894 2895   * structure can be reused without having to invoke zone_init_ref().
2895 2896   * subsys should be the same value that was passed to zone_hold_ref()
2896 2897   * when the reference was acquired.
2897 2898   */
2898 2899  void
2899 2900  zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2900 2901  {
2901 2902          zone_rele_common(ref->zref_zone, ref, subsys);
2902 2903  
2903 2904          /*
2904 2905           * Set the zone_ref_t's zref_zone field to NULL to generate panics
2905 2906           * when consumers dereference the reference.  This helps us catch
2906 2907           * consumers who use released references.  Furthermore, this lets
2907 2908           * consumers reuse the zone_ref_t structure without having to
2908 2909           * invoke zone_init_ref().
2909 2910           */
2910 2911          ref->zref_zone = NULL;
2911 2912  }
2912 2913  
2913 2914  void
2914 2915  zone_cred_hold(zone_t *z)
2915 2916  {
2916 2917          mutex_enter(&z->zone_lock);
2917 2918          z->zone_cred_ref++;
2918 2919          ASSERT(z->zone_cred_ref != 0);
2919 2920          mutex_exit(&z->zone_lock);
2920 2921  }
2921 2922  
2922 2923  void
2923 2924  zone_cred_rele(zone_t *z)
2924 2925  {
2925 2926          boolean_t wakeup;
2926 2927  
2927 2928          mutex_enter(&z->zone_lock);
2928 2929          ASSERT(z->zone_cred_ref != 0);
2929 2930          z->zone_cred_ref--;
2930 2931          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2931 2932                  /* no more refs, free the structure */
2932 2933                  mutex_exit(&z->zone_lock);
2933 2934                  zone_free(z);
2934 2935                  return;
2935 2936          }
2936 2937          /*
2937 2938           * If zone_destroy is waiting for the cred references to drain
2938 2939           * out, and they have, signal it.
2939 2940           */
2940 2941          wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2941 2942              zone_status_get(z) >= ZONE_IS_DEAD);
2942 2943          mutex_exit(&z->zone_lock);
2943 2944  
2944 2945          if (wakeup) {
2945 2946                  /*
2946 2947                   * Grabbing zonehash_lock here effectively synchronizes with
2947 2948                   * zone_destroy() to avoid missed signals.
2948 2949                   */
2949 2950                  mutex_enter(&zonehash_lock);
2950 2951                  cv_broadcast(&zone_destroy_cv);
2951 2952                  mutex_exit(&zonehash_lock);
2952 2953          }
2953 2954  }
2954 2955  
2955 2956  void
2956 2957  zone_task_hold(zone_t *z)
2957 2958  {
2958 2959          mutex_enter(&z->zone_lock);
2959 2960          z->zone_ntasks++;
2960 2961          ASSERT(z->zone_ntasks != 0);
2961 2962          mutex_exit(&z->zone_lock);
2962 2963  }
2963 2964  
2964 2965  void
2965 2966  zone_task_rele(zone_t *zone)
2966 2967  {
2967 2968          uint_t refcnt;
2968 2969  
2969 2970          mutex_enter(&zone->zone_lock);
2970 2971          ASSERT(zone->zone_ntasks != 0);
2971 2972          refcnt = --zone->zone_ntasks;
2972 2973          if (refcnt > 1) {       /* Common case */
2973 2974                  mutex_exit(&zone->zone_lock);
2974 2975                  return;
2975 2976          }
2976 2977          zone_hold_locked(zone); /* so we can use the zone_t later */
2977 2978          mutex_exit(&zone->zone_lock);
2978 2979          if (refcnt == 1) {
2979 2980                  /*
2980 2981                   * See if the zone is shutting down.
2981 2982                   */
2982 2983                  mutex_enter(&zone_status_lock);
2983 2984                  if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2984 2985                          goto out;
2985 2986                  }
2986 2987  
2987 2988                  /*
2988 2989                   * Make sure the ntasks didn't change since we
2989 2990                   * dropped zone_lock.
2990 2991                   */
2991 2992                  mutex_enter(&zone->zone_lock);
2992 2993                  if (refcnt != zone->zone_ntasks) {
2993 2994                          mutex_exit(&zone->zone_lock);
2994 2995                          goto out;
2995 2996                  }
2996 2997                  mutex_exit(&zone->zone_lock);
2997 2998  
2998 2999                  /*
2999 3000                   * No more user processes in the zone.  The zone is empty.
3000 3001                   */
3001 3002                  zone_status_set(zone, ZONE_IS_EMPTY);
3002 3003                  goto out;
3003 3004          }
3004 3005  
3005 3006          ASSERT(refcnt == 0);
3006 3007          /*
3007 3008           * zsched has exited; the zone is dead.
3008 3009           */
3009 3010          zone->zone_zsched = NULL;               /* paranoia */
3010 3011          mutex_enter(&zone_status_lock);
3011 3012          zone_status_set(zone, ZONE_IS_DEAD);
3012 3013  out:

↓ open down ↓

2579 lines elided

↑ open up ↑

3013 3014          mutex_exit(&zone_status_lock);
3014 3015          zone_rele(zone);
3015 3016  }
3016 3017  
3017 3018  zoneid_t
3018 3019  getzoneid(void)
3019 3020  {
3020 3021          return (curproc->p_zone->zone_id);
3021 3022  }
3022 3023  
     3024 +zoneid_t
     3025 +getzonedid(void)
     3026 +{
     3027 +        return (curproc->p_zone->zone_did);
     3028 +}
     3029 +
3023 3030  /*
3024 3031   * Internal versions of zone_find_by_*().  These don't zone_hold() or
3025 3032   * check the validity of a zone's state.
3026 3033   */
3027 3034  static zone_t *
3028 3035  zone_find_all_by_id(zoneid_t zoneid)
3029 3036  {
3030 3037          mod_hash_val_t hv;
3031 3038          zone_t *zone = NULL;
3032 3039

3033 3040          ASSERT(MUTEX_HELD(&zonehash_lock));
3034 3041  
3035 3042          if (mod_hash_find(zonehashbyid,
3036 3043              (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3037 3044                  zone = (zone_t *)hv;
3038 3045          return (zone);
3039 3046  }
3040 3047  
3041 3048  static zone_t *
3042 3049  zone_find_all_by_label(const ts_label_t *label)
3043 3050  {
3044 3051          mod_hash_val_t hv;
3045 3052          zone_t *zone = NULL;
3046 3053  
3047 3054          ASSERT(MUTEX_HELD(&zonehash_lock));
3048 3055  
3049 3056          /*
3050 3057           * zonehashbylabel is not maintained for unlabeled systems
3051 3058           */
3052 3059          if (!is_system_labeled())
3053 3060                  return (NULL);
3054 3061          if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3055 3062                  zone = (zone_t *)hv;
3056 3063          return (zone);
3057 3064  }
3058 3065  
3059 3066  static zone_t *
3060 3067  zone_find_all_by_name(char *name)
3061 3068  {
3062 3069          mod_hash_val_t hv;
3063 3070          zone_t *zone = NULL;
3064 3071  
3065 3072          ASSERT(MUTEX_HELD(&zonehash_lock));
3066 3073  
3067 3074          if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3068 3075                  zone = (zone_t *)hv;
3069 3076          return (zone);
3070 3077  }
3071 3078  
3072 3079  /*
3073 3080   * Public interface for looking up a zone by zoneid.  Only returns the zone if
3074 3081   * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3075 3082   * Caller must call zone_rele() once it is done with the zone.
3076 3083   *
3077 3084   * The zone may begin the zone_destroy() sequence immediately after this
3078 3085   * function returns, but may be safely used until zone_rele() is called.
3079 3086   */
3080 3087  zone_t *
3081 3088  zone_find_by_id(zoneid_t zoneid)
3082 3089  {
3083 3090          zone_t *zone;
3084 3091          zone_status_t status;
3085 3092  
3086 3093          mutex_enter(&zonehash_lock);
3087 3094          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3088 3095                  mutex_exit(&zonehash_lock);
3089 3096                  return (NULL);
3090 3097          }
3091 3098          status = zone_status_get(zone);
3092 3099          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3093 3100                  /*
3094 3101                   * For all practical purposes the zone doesn't exist.
3095 3102                   */
3096 3103                  mutex_exit(&zonehash_lock);
3097 3104                  return (NULL);
3098 3105          }
3099 3106          zone_hold(zone);
3100 3107          mutex_exit(&zonehash_lock);
3101 3108          return (zone);
3102 3109  }
3103 3110  
3104 3111  /*
3105 3112   * Similar to zone_find_by_id, but using zone label as the key.
3106 3113   */
3107 3114  zone_t *
3108 3115  zone_find_by_label(const ts_label_t *label)
3109 3116  {
3110 3117          zone_t *zone;
3111 3118          zone_status_t status;
3112 3119  
3113 3120          mutex_enter(&zonehash_lock);
3114 3121          if ((zone = zone_find_all_by_label(label)) == NULL) {
3115 3122                  mutex_exit(&zonehash_lock);
3116 3123                  return (NULL);
3117 3124          }
3118 3125  
3119 3126          status = zone_status_get(zone);
3120 3127          if (status > ZONE_IS_DOWN) {
3121 3128                  /*
3122 3129                   * For all practical purposes the zone doesn't exist.
3123 3130                   */
3124 3131                  mutex_exit(&zonehash_lock);
3125 3132                  return (NULL);
3126 3133          }
3127 3134          zone_hold(zone);
3128 3135          mutex_exit(&zonehash_lock);
3129 3136          return (zone);
3130 3137  }
3131 3138  
3132 3139  /*
3133 3140   * Similar to zone_find_by_id, but using zone name as the key.
3134 3141   */
3135 3142  zone_t *
3136 3143  zone_find_by_name(char *name)
3137 3144  {
3138 3145          zone_t *zone;
3139 3146          zone_status_t status;
3140 3147  
3141 3148          mutex_enter(&zonehash_lock);
3142 3149          if ((zone = zone_find_all_by_name(name)) == NULL) {
3143 3150                  mutex_exit(&zonehash_lock);
3144 3151                  return (NULL);
3145 3152          }
3146 3153          status = zone_status_get(zone);
3147 3154          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3148 3155                  /*
3149 3156                   * For all practical purposes the zone doesn't exist.
3150 3157                   */
3151 3158                  mutex_exit(&zonehash_lock);
3152 3159                  return (NULL);
3153 3160          }
3154 3161          zone_hold(zone);
3155 3162          mutex_exit(&zonehash_lock);
3156 3163          return (zone);
3157 3164  }
3158 3165  
3159 3166  /*
3160 3167   * Similar to zone_find_by_id(), using the path as a key.  For instance,
3161 3168   * if there is a zone "foo" rooted at /foo/root, and the path argument
3162 3169   * is "/foo/root/proc", it will return the held zone_t corresponding to
3163 3170   * zone "foo".
3164 3171   *
3165 3172   * zone_find_by_path() always returns a non-NULL value, since at the
3166 3173   * very least every path will be contained in the global zone.
3167 3174   *
3168 3175   * As with the other zone_find_by_*() functions, the caller is
3169 3176   * responsible for zone_rele()ing the return value of this function.
3170 3177   */
3171 3178  zone_t *
3172 3179  zone_find_by_path(const char *path)
3173 3180  {
3174 3181          zone_t *zone;
3175 3182          zone_t *zret = NULL;
3176 3183          zone_status_t status;
3177 3184  
3178 3185          if (path == NULL) {
3179 3186                  /*
3180 3187                   * Call from rootconf().
3181 3188                   */
3182 3189                  zone_hold(global_zone);
3183 3190                  return (global_zone);
3184 3191          }
3185 3192          ASSERT(*path == '/');
3186 3193          mutex_enter(&zonehash_lock);
3187 3194          for (zone = list_head(&zone_active); zone != NULL;
3188 3195              zone = list_next(&zone_active, zone)) {
3189 3196                  if (ZONE_PATH_VISIBLE(path, zone))
3190 3197                          zret = zone;
3191 3198          }
3192 3199          ASSERT(zret != NULL);
3193 3200          status = zone_status_get(zret);
3194 3201          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3195 3202                  /*
3196 3203                   * Zone practically doesn't exist.
3197 3204                   */
3198 3205                  zret = global_zone;
3199 3206          }
3200 3207          zone_hold(zret);
3201 3208          mutex_exit(&zonehash_lock);
3202 3209          return (zret);
3203 3210  }
3204 3211  
3205 3212  /*
3206 3213   * Public interface for updating per-zone load averages.  Called once per
3207 3214   * second.
3208 3215   *
3209 3216   * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3210 3217   */
3211 3218  void
3212 3219  zone_loadavg_update(void)
3213 3220  {
3214 3221          zone_t *zp;
3215 3222          zone_status_t status;
3216 3223          struct loadavg_s *lavg;
3217 3224          hrtime_t zone_total;
3218 3225          uint64_t tmp;
3219 3226          int i;
3220 3227          hrtime_t hr_avg;
3221 3228          int nrun;
3222 3229          static int64_t f[3] = { 135, 27, 9 };
3223 3230          int64_t q, r;
3224 3231  
3225 3232          mutex_enter(&zonehash_lock);
3226 3233          for (zp = list_head(&zone_active); zp != NULL;
3227 3234              zp = list_next(&zone_active, zp)) {
3228 3235                  mutex_enter(&zp->zone_lock);
3229 3236  
3230 3237                  /* Skip zones that are on the way down or not yet up */
3231 3238                  status = zone_status_get(zp);
3232 3239                  if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3233 3240                          /* For all practical purposes the zone doesn't exist. */
3234 3241                          mutex_exit(&zp->zone_lock);
3235 3242                          continue;
3236 3243                  }
3237 3244  
3238 3245                  /*
3239 3246                   * Update the 10 second moving average data in zone_loadavg.
3240 3247                   */
3241 3248                  lavg = &zp->zone_loadavg;
3242 3249  
3243 3250                  tmp = cpu_uarray_sum_all(zp->zone_ustate);
3244 3251                  zone_total = UINT64_OVERFLOW_TO_INT64(tmp);
3245 3252  
3246 3253                  scalehrtime(&zone_total);
3247 3254  
3248 3255                  /* The zone_total should always be increasing. */
3249 3256                  lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3250 3257                      zone_total - lavg->lg_total : 0;
3251 3258                  lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3252 3259                  /* lg_total holds the prev. 1 sec. total */
3253 3260                  lavg->lg_total = zone_total;
3254 3261  
3255 3262                  /*
3256 3263                   * To simplify the calculation, we don't calculate the load avg.
3257 3264                   * until the zone has been up for at least 10 seconds and our
3258 3265                   * moving average is thus full.
3259 3266                   */
3260 3267                  if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3261 3268                          lavg->lg_len++;
3262 3269                          mutex_exit(&zp->zone_lock);
3263 3270                          continue;
3264 3271                  }
3265 3272  
3266 3273                  /* Now calculate the 1min, 5min, 15 min load avg. */
3267 3274                  hr_avg = 0;
3268 3275                  for (i = 0; i < S_LOADAVG_SZ; i++)
3269 3276                          hr_avg += lavg->lg_loads[i];
3270 3277                  hr_avg = hr_avg / S_LOADAVG_SZ;
3271 3278                  nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3272 3279  
3273 3280                  /* Compute load avg. See comment in calcloadavg() */
3274 3281                  for (i = 0; i < 3; i++) {
3275 3282                          q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3276 3283                          r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3277 3284                          zp->zone_hp_avenrun[i] +=
3278 3285                              ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3279 3286  
3280 3287                          /* avenrun[] can only hold 31 bits of load avg. */
3281 3288                          if (zp->zone_hp_avenrun[i] <
3282 3289                              ((uint64_t)1<<(31+16-FSHIFT)))
3283 3290                                  zp->zone_avenrun[i] = (int32_t)
3284 3291                                      (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3285 3292                          else
3286 3293                                  zp->zone_avenrun[i] = 0x7fffffff;
3287 3294                  }
3288 3295  
3289 3296                  mutex_exit(&zp->zone_lock);
3290 3297          }
3291 3298          mutex_exit(&zonehash_lock);
3292 3299  }
3293 3300  
3294 3301  /*
3295 3302   * Get the number of cpus visible to this zone.  The system-wide global
3296 3303   * 'ncpus' is returned if pools are disabled, the caller is in the
3297 3304   * global zone, or a NULL zone argument is passed in.
3298 3305   */
3299 3306  int
3300 3307  zone_ncpus_get(zone_t *zone)
3301 3308  {
3302 3309          int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3303 3310  
3304 3311          return (myncpus != 0 ? myncpus : ncpus);
3305 3312  }
3306 3313  
3307 3314  /*
3308 3315   * Get the number of online cpus visible to this zone.  The system-wide
3309 3316   * global 'ncpus_online' is returned if pools are disabled, the caller
3310 3317   * is in the global zone, or a NULL zone argument is passed in.
3311 3318   */
3312 3319  int
3313 3320  zone_ncpus_online_get(zone_t *zone)
3314 3321  {
3315 3322          int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3316 3323  
3317 3324          return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3318 3325  }
3319 3326  
3320 3327  /*
3321 3328   * Return the pool to which the zone is currently bound.
3322 3329   */
3323 3330  pool_t *
3324 3331  zone_pool_get(zone_t *zone)
3325 3332  {
3326 3333          ASSERT(pool_lock_held());
3327 3334  
3328 3335          return (zone->zone_pool);
3329 3336  }
3330 3337  
3331 3338  /*
3332 3339   * Set the zone's pool pointer and update the zone's visibility to match
3333 3340   * the resources in the new pool.
3334 3341   */
3335 3342  void
3336 3343  zone_pool_set(zone_t *zone, pool_t *pool)
3337 3344  {
3338 3345          ASSERT(pool_lock_held());
3339 3346          ASSERT(MUTEX_HELD(&cpu_lock));
3340 3347  
3341 3348          zone->zone_pool = pool;
3342 3349          zone_pset_set(zone, pool->pool_pset->pset_id);
3343 3350  }
3344 3351  
3345 3352  /*
3346 3353   * Return the cached value of the id of the processor set to which the
3347 3354   * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3348 3355   * facility is disabled.
3349 3356   */
3350 3357  psetid_t
3351 3358  zone_pset_get(zone_t *zone)
3352 3359  {
3353 3360          ASSERT(MUTEX_HELD(&cpu_lock));
3354 3361  
3355 3362          return (zone->zone_psetid);
3356 3363  }
3357 3364  
3358 3365  /*
3359 3366   * Set the cached value of the id of the processor set to which the zone
3360 3367   * is currently bound.  Also update the zone's visibility to match the
3361 3368   * resources in the new processor set.
3362 3369   */
3363 3370  void
3364 3371  zone_pset_set(zone_t *zone, psetid_t newpsetid)
3365 3372  {
3366 3373          psetid_t oldpsetid;
3367 3374  
3368 3375          ASSERT(MUTEX_HELD(&cpu_lock));
3369 3376          oldpsetid = zone_pset_get(zone);
3370 3377  
3371 3378          if (oldpsetid == newpsetid)
3372 3379                  return;
3373 3380          /*
3374 3381           * Global zone sees all.
3375 3382           */
3376 3383          if (zone != global_zone) {
3377 3384                  zone->zone_psetid = newpsetid;
3378 3385                  if (newpsetid != ZONE_PS_INVAL)
3379 3386                          pool_pset_visibility_add(newpsetid, zone);
3380 3387                  if (oldpsetid != ZONE_PS_INVAL)
3381 3388                          pool_pset_visibility_remove(oldpsetid, zone);
3382 3389          }
3383 3390          /*
3384 3391           * Disabling pools, so we should start using the global values
3385 3392           * for ncpus and ncpus_online.
3386 3393           */
3387 3394          if (newpsetid == ZONE_PS_INVAL) {
3388 3395                  zone->zone_ncpus = 0;
3389 3396                  zone->zone_ncpus_online = 0;
3390 3397          }
3391 3398  }
3392 3399  
3393 3400  /*
3394 3401   * Walk the list of active zones and issue the provided callback for
3395 3402   * each of them.
3396 3403   *
3397 3404   * Caller must not be holding any locks that may be acquired under
3398 3405   * zonehash_lock.  See comment at the beginning of the file for a list of
3399 3406   * common locks and their interactions with zones.
3400 3407   */
3401 3408  int
3402 3409  zone_walk(int (*cb)(zone_t *, void *), void *data)
3403 3410  {
3404 3411          zone_t *zone;
3405 3412          int ret = 0;
3406 3413          zone_status_t status;
3407 3414  
3408 3415          mutex_enter(&zonehash_lock);
3409 3416          for (zone = list_head(&zone_active); zone != NULL;
3410 3417              zone = list_next(&zone_active, zone)) {
3411 3418                  /*
3412 3419                   * Skip zones that shouldn't be externally visible.
3413 3420                   */
3414 3421                  status = zone_status_get(zone);
3415 3422                  if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3416 3423                          continue;
3417 3424                  /*
3418 3425                   * Bail immediately if any callback invocation returns a
3419 3426                   * non-zero value.
3420 3427                   */
3421 3428                  ret = (*cb)(zone, data);
3422 3429                  if (ret != 0)
3423 3430                          break;
3424 3431          }
3425 3432          mutex_exit(&zonehash_lock);
3426 3433          return (ret);
3427 3434  }
3428 3435  
3429 3436  static int
3430 3437  zone_set_root(zone_t *zone, const char *upath)
3431 3438  {
3432 3439          vnode_t *vp;
3433 3440          int trycount;
3434 3441          int error = 0;
3435 3442          char *path;
3436 3443          struct pathname upn, pn;
3437 3444          size_t pathlen;
3438 3445  
3439 3446          if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3440 3447                  return (error);
3441 3448  
3442 3449          pn_alloc(&pn);
3443 3450  
3444 3451          /* prevent infinite loop */
3445 3452          trycount = 10;
3446 3453          for (;;) {
3447 3454                  if (--trycount <= 0) {
3448 3455                          error = ESTALE;
3449 3456                          goto out;
3450 3457                  }
3451 3458  
3452 3459                  if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3453 3460                          /*
3454 3461                           * VOP_ACCESS() may cover 'vp' with a new
3455 3462                           * filesystem, if 'vp' is an autoFS vnode.
3456 3463                           * Get the new 'vp' if so.
3457 3464                           */
3458 3465                          if ((error =
3459 3466                              VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3460 3467                              (!vn_ismntpt(vp) ||
3461 3468                              (error = traverse(&vp)) == 0)) {
3462 3469                                  pathlen = pn.pn_pathlen + 2;
3463 3470                                  path = kmem_alloc(pathlen, KM_SLEEP);
3464 3471                                  (void) strncpy(path, pn.pn_path,
3465 3472                                      pn.pn_pathlen + 1);
3466 3473                                  path[pathlen - 2] = '/';
3467 3474                                  path[pathlen - 1] = '\0';
3468 3475                                  pn_free(&pn);
3469 3476                                  pn_free(&upn);
3470 3477  
3471 3478                                  /* Success! */
3472 3479                                  break;
3473 3480                          }
3474 3481                          VN_RELE(vp);
3475 3482                  }
3476 3483                  if (error != ESTALE)
3477 3484                          goto out;
3478 3485          }
3479 3486  
3480 3487          ASSERT(error == 0);
3481 3488          zone->zone_rootvp = vp;         /* we hold a reference to vp */
3482 3489          zone->zone_rootpath = path;
3483 3490          zone->zone_rootpathlen = pathlen;
3484 3491          if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3485 3492                  zone->zone_flags |= ZF_IS_SCRATCH;
3486 3493          return (0);
3487 3494  
3488 3495  out:
3489 3496          pn_free(&pn);
3490 3497          pn_free(&upn);
3491 3498          return (error);
3492 3499  }
3493 3500  
/*
 * Local alphanumeric test covering only the ranges '0'-'9', 'a'-'z' and
 * 'A'-'Z'.  The argument is evaluated more than once, so it must not have
 * side effects.
 */
#define isalnum(c) \
        (('0' <= (c) && (c) <= '9') || \
        ('a' <= (c) && (c) <= 'z') || \
        ('A' <= (c) && (c) <= 'Z'))
3497 3504  
3498 3505  static int
3499 3506  zone_set_name(zone_t *zone, const char *uname)
3500 3507  {
3501 3508          char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3502 3509          size_t len;
3503 3510          int i, err;
3504 3511  
3505 3512          if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3506 3513                  kmem_free(kname, ZONENAME_MAX);
3507 3514                  return (err);   /* EFAULT or ENAMETOOLONG */
3508 3515          }
3509 3516  
3510 3517          /* must be less than ZONENAME_MAX */
3511 3518          if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3512 3519                  kmem_free(kname, ZONENAME_MAX);
3513 3520                  return (EINVAL);
3514 3521          }
3515 3522  
3516 3523          /*
3517 3524           * Name must start with an alphanumeric and must contain only
3518 3525           * alphanumerics, '-', '_' and '.'.
3519 3526           */
3520 3527          if (!isalnum(kname[0])) {
3521 3528                  kmem_free(kname, ZONENAME_MAX);
3522 3529                  return (EINVAL);
3523 3530          }
3524 3531          for (i = 1; i < len - 1; i++) {
3525 3532                  if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3526 3533                      kname[i] != '.') {
3527 3534                          kmem_free(kname, ZONENAME_MAX);
3528 3535                          return (EINVAL);
3529 3536                  }
3530 3537          }
3531 3538  
3532 3539          zone->zone_name = kname;
3533 3540          return (0);
3534 3541  }
3535 3542  
3536 3543  /*
3537 3544   * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3538 3545   * is NULL or it points to a zone with no hostid emulation, then the machine's
3539 3546   * hostid (i.e., the global zone's hostid) is returned.  This function returns
3540 3547   * zero if neither the zone nor the host machine (global zone) have hostids.  It
3541 3548   * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3542 3549   * hostid and the machine's hostid is invalid.
3543 3550   */
3544 3551  uint32_t
3545 3552  zone_get_hostid(zone_t *zonep)
3546 3553  {
3547 3554          unsigned long machine_hostid;
3548 3555  
3549 3556          if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3550 3557                  if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3551 3558                          return (HW_INVALID_HOSTID);
3552 3559                  return ((uint32_t)machine_hostid);
3553 3560          }
3554 3561          return (zonep->zone_hostid);
3555 3562  }
3556 3563  
3557 3564  /*
3558 3565   * Similar to thread_create(), but makes sure the thread is in the appropriate
3559 3566   * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3560 3567   */
3561 3568  /*ARGSUSED*/
3562 3569  kthread_t *
3563 3570  zthread_create(
3564 3571      caddr_t stk,
3565 3572      size_t stksize,
3566 3573      void (*proc)(),
3567 3574      void *arg,
3568 3575      size_t len,
3569 3576      pri_t pri)
3570 3577  {
3571 3578          kthread_t *t;
3572 3579          zone_t *zone = curproc->p_zone;
3573 3580          proc_t *pp = zone->zone_zsched;
3574 3581  
3575 3582          zone_hold(zone);        /* Reference to be dropped when thread exits */
3576 3583  
3577 3584          /*
3578 3585           * No-one should be trying to create threads if the zone is shutting
3579 3586           * down and there aren't any kernel threads around.  See comment
3580 3587           * in zthread_exit().
3581 3588           */
3582 3589          ASSERT(!(zone->zone_kthreads == NULL &&
3583 3590              zone_status_get(zone) >= ZONE_IS_EMPTY));
3584 3591          /*
3585 3592           * Create a thread, but don't let it run until we've finished setting
3586 3593           * things up.
3587 3594           */
3588 3595          t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3589 3596          ASSERT(t->t_forw == NULL);
3590 3597          mutex_enter(&zone_status_lock);
3591 3598          if (zone->zone_kthreads == NULL) {
3592 3599                  t->t_forw = t->t_back = t;
3593 3600          } else {
3594 3601                  kthread_t *tx = zone->zone_kthreads;
3595 3602  
3596 3603                  t->t_forw = tx;
3597 3604                  t->t_back = tx->t_back;
3598 3605                  tx->t_back->t_forw = t;
3599 3606                  tx->t_back = t;
3600 3607          }
3601 3608          zone->zone_kthreads = t;
3602 3609          mutex_exit(&zone_status_lock);
3603 3610  
3604 3611          mutex_enter(&pp->p_lock);
3605 3612          t->t_proc_flag |= TP_ZTHREAD;
3606 3613          project_rele(t->t_proj);
3607 3614          t->t_proj = project_hold(pp->p_task->tk_proj);
3608 3615  
3609 3616          /*
3610 3617           * Setup complete, let it run.
3611 3618           */
3612 3619          thread_lock(t);
3613 3620          t->t_schedflag |= TS_ALLSTART;
3614 3621          setrun_locked(t);
3615 3622          thread_unlock(t);
3616 3623  
3617 3624          mutex_exit(&pp->p_lock);
3618 3625  
3619 3626          return (t);
3620 3627  }
3621 3628  
/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 */
3626 3633  void
3627 3634  zthread_exit(void)
3628 3635  {
3629 3636          kthread_t *t = curthread;
3630 3637          proc_t *pp = curproc;
3631 3638          zone_t *zone = pp->p_zone;
3632 3639  
3633 3640          mutex_enter(&zone_status_lock);
3634 3641  
3635 3642          /*
3636 3643           * Reparent to p0
3637 3644           */
3638 3645          kpreempt_disable();
3639 3646          mutex_enter(&pp->p_lock);
3640 3647          t->t_proc_flag &= ~TP_ZTHREAD;
3641 3648          t->t_procp = &p0;
3642 3649          hat_thread_exit(t);
3643 3650          mutex_exit(&pp->p_lock);
3644 3651          kpreempt_enable();
3645 3652  
3646 3653          if (t->t_back == t) {
3647 3654                  ASSERT(t->t_forw == t);
3648 3655                  /*
3649 3656                   * If the zone is empty, once the thread count
3650 3657                   * goes to zero no further kernel threads can be
3651 3658                   * created.  This is because if the creator is a process
3652 3659                   * in the zone, then it must have exited before the zone
3653 3660                   * state could be set to ZONE_IS_EMPTY.
3654 3661                   * Otherwise, if the creator is a kernel thread in the
3655 3662                   * zone, the thread count is non-zero.
3656 3663                   *
3657 3664                   * This really means that non-zone kernel threads should
3658 3665                   * not create zone kernel threads.
3659 3666                   */
3660 3667                  zone->zone_kthreads = NULL;
3661 3668                  if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3662 3669                          zone_status_set(zone, ZONE_IS_DOWN);
3663 3670                          /*
3664 3671                           * Remove any CPU caps on this zone.
3665 3672                           */
3666 3673                          cpucaps_zone_remove(zone);
3667 3674                  }
3668 3675          } else {
3669 3676                  t->t_forw->t_back = t->t_back;
3670 3677                  t->t_back->t_forw = t->t_forw;
3671 3678                  if (zone->zone_kthreads == t)
3672 3679                          zone->zone_kthreads = t->t_forw;
3673 3680          }
3674 3681          mutex_exit(&zone_status_lock);
3675 3682          zone_rele(zone);
3676 3683          thread_exit();
3677 3684          /* NOTREACHED */
3678 3685  }
3679 3686  
3680 3687  static void
3681 3688  zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3682 3689  {
3683 3690          vnode_t *oldvp;
3684 3691  
3685 3692          /* we're going to hold a reference here to the directory */
3686 3693          VN_HOLD(vp);
3687 3694  
3688 3695          /* update abs cwd/root path see c2/audit.c */
3689 3696          if (AU_AUDITING())
3690 3697                  audit_chdirec(vp, vpp);
3691 3698  
3692 3699          mutex_enter(&pp->p_lock);
3693 3700          oldvp = *vpp;
3694 3701          *vpp = vp;
3695 3702          mutex_exit(&pp->p_lock);
3696 3703          if (oldvp != NULL)
3697 3704                  VN_RELE(oldvp);
3698 3705  }
3699 3706  
3700 3707  /*
3701 3708   * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3702 3709   */
3703 3710  static int
3704 3711  nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3705 3712  {
3706 3713          nvpair_t *nvp = NULL;
3707 3714          boolean_t priv_set = B_FALSE;
3708 3715          boolean_t limit_set = B_FALSE;
3709 3716          boolean_t action_set = B_FALSE;
3710 3717  
3711 3718          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3712 3719                  const char *name;
3713 3720                  uint64_t ui64;
3714 3721  
3715 3722                  name = nvpair_name(nvp);
3716 3723                  if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3717 3724                          return (EINVAL);
3718 3725                  (void) nvpair_value_uint64(nvp, &ui64);
3719 3726                  if (strcmp(name, "privilege") == 0) {
3720 3727                          /*
3721 3728                           * Currently only privileged values are allowed, but
3722 3729                           * this may change in the future.
3723 3730                           */
3724 3731                          if (ui64 != RCPRIV_PRIVILEGED)
3725 3732                                  return (EINVAL);
3726 3733                          rv->rcv_privilege = ui64;
3727 3734                          priv_set = B_TRUE;
3728 3735                  } else if (strcmp(name, "limit") == 0) {
3729 3736                          rv->rcv_value = ui64;
3730 3737                          limit_set = B_TRUE;
3731 3738                  } else if (strcmp(name, "action") == 0) {
3732 3739                          if (ui64 != RCTL_LOCAL_NOACTION &&
3733 3740                              ui64 != RCTL_LOCAL_DENY)
3734 3741                                  return (EINVAL);
3735 3742                          rv->rcv_flagaction = ui64;
3736 3743                          action_set = B_TRUE;
3737 3744                  } else {
3738 3745                          return (EINVAL);
3739 3746                  }
3740 3747          }
3741 3748  
3742 3749          if (!(priv_set && limit_set && action_set))
3743 3750                  return (EINVAL);
3744 3751          rv->rcv_action_signal = 0;
3745 3752          rv->rcv_action_recipient = NULL;
3746 3753          rv->rcv_action_recip_pid = -1;
3747 3754          rv->rcv_firing_time = 0;
3748 3755  
3749 3756          return (0);
3750 3757  }
3751 3758  
3752 3759  /*
3753 3760   * Non-global zone version of start_init.
3754 3761   */
3755 3762  void
3756 3763  zone_start_init(void)
3757 3764  {
3758 3765          proc_t *p = ttoproc(curthread);
3759 3766          zone_t *z = p->p_zone;
3760 3767  
3761 3768          ASSERT(!INGLOBALZONE(curproc));
3762 3769  
3763 3770          /*
3764 3771           * For all purposes (ZONE_ATTR_INITPID and restart_init),
3765 3772           * storing just the pid of init is sufficient.
3766 3773           */
3767 3774          z->zone_proc_initpid = p->p_pid;
3768 3775  
3769 3776          /*
3770 3777           * We maintain zone_boot_err so that we can return the cause of the
3771 3778           * failure back to the caller of the zone_boot syscall.
3772 3779           */
3773 3780          p->p_zone->zone_boot_err = start_init_common();
3774 3781  
3775 3782          /*
3776 3783           * We will prevent booting zones from becoming running zones if the
3777 3784           * global zone is shutting down.
3778 3785           */
3779 3786          mutex_enter(&zone_status_lock);
3780 3787          if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3781 3788              ZONE_IS_SHUTTING_DOWN) {
3782 3789                  /*
3783 3790                   * Make sure we are still in the booting state-- we could have
3784 3791                   * raced and already be shutting down, or even further along.
3785 3792                   */
3786 3793                  if (zone_status_get(z) == ZONE_IS_BOOTING) {
3787 3794                          zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3788 3795                  }
3789 3796                  mutex_exit(&zone_status_lock);
3790 3797                  /* It's gone bad, dispose of the process */
3791 3798                  if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3792 3799                          mutex_enter(&p->p_lock);
3793 3800                          ASSERT(p->p_flag & SEXITLWPS);
3794 3801                          lwp_exit();
3795 3802                  }
3796 3803          } else {
3797 3804                  if (zone_status_get(z) == ZONE_IS_BOOTING)
3798 3805                          zone_status_set(z, ZONE_IS_RUNNING);
3799 3806                  mutex_exit(&zone_status_lock);
3800 3807                  /* cause the process to return to userland. */
3801 3808                  lwp_rtt();
3802 3809          }
3803 3810  }
3804 3811  
/*
 * Argument bundle handed to zsched().
 */
struct zsched_arg {
        zone_t *zone;           /* zone zsched() becomes the zsched of */
        nvlist_t *nvlist;       /* rctls zsched() applies to that zone */
};
3809 3816  
/*
 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
 * anything to do with scheduling, but rather with the fact that
 * per-zone kernel threads are parented to zsched, just like regular
 * kernel threads are parented to sched (p0).
 *
 * zsched is also responsible for launching init for the zone.
 *
 * 'arg' points to a struct zsched_arg naming the zone to adopt and the
 * nvlist of rctls to apply to it.  In order, this function:
 *
 *   - renames the process to "zsched" and attaches it to the zone;
 *   - reparents it to init and moves its lwp/process/locked-memory
 *     accounting from the global zone to the new zone;
 *   - joins a new task in project 0 of the zone and switches to the
 *     zone's kcred;
 *   - sets cwd and root to the zone's root vnode;
 *   - creates the zone's rctl set and applies the supplied rctls;
 *   - walks the zone through ZONE_IS_INITIALIZED and ZONE_IS_READY;
 *   - waits for ZONE_IS_BOOTING and then starts the zone's init;
 *   - waits for ZONE_IS_DYING, cleans up, and exits.
 */
static void
zsched(void *arg)
{
        struct zsched_arg *za = arg;
        proc_t *pp = curproc;
        proc_t *initp = proc_init;
        zone_t *zone = za->zone;
        cred_t *cr, *oldcred;
        rctl_set_t *set;
        rctl_alloc_gp_t *gp;
        contract_t *ct = NULL;
        task_t *tk, *oldtk;
        rctl_entity_p_t e;
        kproject_t *pj;

        nvlist_t *nvl = za->nvlist;
        nvpair_t *nvp = NULL;

        /*
         * Present ourselves as "zsched" and reset the user-area fields
         * left over from the process we were created from.
         */
        bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
        bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
        PTOU(pp)->u_argc = 0;
        PTOU(pp)->u_argv = 0;
        PTOU(pp)->u_envp = 0;
        PTOU(pp)->u_commpagep = 0;
        closeall(P_FINFO(pp));

        /*
         * We are this zone's "zsched" process.  As the zone isn't generally
         * visible yet we don't need to grab any locks before initializing its
         * zone_proc pointer.
         */
        zone_hold(zone);  /* this hold is released by zone_destroy() */
        zone->zone_zsched = pp;
        mutex_enter(&pp->p_lock);
        pp->p_zone = zone;
        mutex_exit(&pp->p_lock);

        /*
         * Disassociate process from its 'parent'; parent ourselves to init
         * (pid 1) and change other values as needed.
         */
        sess_create();

        mutex_enter(&pidlock);
        proc_detach(pp);
        pp->p_ppid = 1;
        pp->p_flag |= SZONETOP;
        pp->p_ancpid = 1;
        pp->p_parent = initp;
        /* link ourselves in at the head of init's child list */
        pp->p_psibling = NULL;
        if (initp->p_child)
                initp->p_child->p_psibling = pp;
        pp->p_sibling = initp->p_child;
        initp->p_child = pp;

        /* Decrement what newproc() incremented. */
        upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
        /*
         * Our credentials are about to become kcred-like, so we don't care
         * about the caller's ruid.
         */
        upcount_inc(crgetruid(kcred), zone->zone_id);
        mutex_exit(&pidlock);

        /*
         * getting out of global zone, so decrement lwp and process counts
         */
        pj = pp->p_task->tk_proj;
        mutex_enter(&global_zone->zone_nlwps_lock);
        pj->kpj_nlwps -= pp->p_lwpcnt;
        global_zone->zone_nlwps -= pp->p_lwpcnt;
        pj->kpj_nprocs--;
        global_zone->zone_nprocs--;
        mutex_exit(&global_zone->zone_nlwps_lock);

        /*
         * Decrement locked memory counts on old zone and project.
         */
        mutex_enter(&global_zone->zone_mem_lock);
        global_zone->zone_locked_mem -= pp->p_locked_mem;
        pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
        mutex_exit(&global_zone->zone_mem_lock);

        /*
         * Create and join a new task in project '0' of this zone.
         *
         * We don't need to call holdlwps() since we know we're the only lwp in
         * this process.
         *
         * task_join() returns with p_lock held.
         */
        tk = task_create(0, zone);
        mutex_enter(&cpu_lock);
        oldtk = task_join(tk, 0);

        /* re-read the project: p_task now refers to the task just joined */
        pj = pp->p_task->tk_proj;

        mutex_enter(&zone->zone_mem_lock);
        zone->zone_locked_mem += pp->p_locked_mem;
        pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
        mutex_exit(&zone->zone_mem_lock);

        /*
         * add lwp and process counts to zsched's zone, and increment
         * project's task and process count due to the task created in
         * the above task_create.
         */
        mutex_enter(&zone->zone_nlwps_lock);
        pj->kpj_nlwps += pp->p_lwpcnt;
        pj->kpj_ntasks += 1;
        zone->zone_nlwps += pp->p_lwpcnt;
        pj->kpj_nprocs++;
        zone->zone_nprocs++;
        mutex_exit(&zone->zone_nlwps_lock);

        /* drop the p_lock that task_join() returned with, then cpu_lock */
        mutex_exit(&curproc->p_lock);
        mutex_exit(&cpu_lock);
        task_rele(oldtk);

        /*
         * The process was created by a process in the global zone, hence the
         * credentials are wrong.  We might as well have kcred-ish credentials.
         */
        cr = zone->zone_kcred;
        crhold(cr);
        mutex_enter(&pp->p_crlock);
        oldcred = pp->p_cred;
        pp->p_cred = cr;
        mutex_exit(&pp->p_crlock);
        crfree(oldcred);

        /*
         * Hold credentials again (for thread)
         */
        crhold(cr);

        /*
         * p_lwpcnt can't change since this is a kernel process.
         */
        crset(pp, cr);

        /*
         * Chroot
         */
        zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
        zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);

        /*
         * Initialize zone's rctl set.
         */
        set = rctl_set_create();
        gp = rctl_set_init_prealloc(RCENTITY_ZONE);
        mutex_enter(&pp->p_lock);
        e.rcep_p.zone = zone;
        e.rcep_t = RCENTITY_ZONE;
        zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
        mutex_exit(&pp->p_lock);
        rctl_prealloc_destroy(gp);

        /*
         * Apply the rctls passed in to zone_create().  This is basically a list
         * assignment: all of the old values are removed and the new ones
         * inserted.  That is, if an empty list is passed in, all values are
         * removed.
         */
        while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
                rctl_dict_entry_t *rde;
                rctl_hndl_t hndl;
                char *name;
                nvlist_t **nvlarray;
                uint_t i, nelem;
                int error;      /* For ASSERT()s */

                name = nvpair_name(nvp);
                hndl = rctl_hndl_lookup(name);
                ASSERT(hndl != -1);
                rde = rctl_dict_lookup_hndl(hndl);
                ASSERT(rde != NULL);

                /*
                 * Delete the first local value over and over until the
                 * first value is the RCPRIV_SYSTEM one, which is kept.
                 */
                for (; /* ever */; ) {
                        rctl_val_t oval;

                        mutex_enter(&pp->p_lock);
                        error = rctl_local_get(hndl, NULL, &oval, pp);
                        mutex_exit(&pp->p_lock);
                        ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
                        ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
                        if (oval.rcv_privilege == RCPRIV_SYSTEM)
                                break;
                        mutex_enter(&pp->p_lock);
                        error = rctl_local_delete(hndl, &oval, pp);
                        mutex_exit(&pp->p_lock);
                        ASSERT(error == 0);
                }
                error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
                ASSERT(error == 0);
                /* insert one freshly allocated value per array element */
                for (i = 0; i < nelem; i++) {
                        rctl_val_t *nvalp;

                        nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
                        error = nvlist2rctlval(nvlarray[i], nvalp);
                        ASSERT(error == 0);
                        /*
                         * rctl_local_insert can fail if the value being
                         * inserted is a duplicate; this is OK.
                         */
                        mutex_enter(&pp->p_lock);
                        if (rctl_local_insert(hndl, nvalp, pp) != 0)
                                kmem_cache_free(rctl_val_cache, nvalp);
                        mutex_exit(&pp->p_lock);
                }
        }

        /*
         * Tell the world that we're done setting up.
         *
         * At this point we want to set the zone status to ZONE_IS_INITIALIZED
         * and atomically set the zone's processor set visibility.  Once
         * we drop pool_lock() this zone will automatically get updated
         * to reflect any future changes to the pools configuration.
         *
         * Note that after we drop the locks below (zonehash_lock in
         * particular) other operations such as a zone_getattr call can
         * now proceed and observe the zone. That is the reason for doing a
         * state transition to the INITIALIZED state.
         */
        pool_lock();
        mutex_enter(&cpu_lock);
        mutex_enter(&zonehash_lock);
        zone_uniqid(zone);
        zone_zsd_configure(zone);
        if (pool_state == POOL_ENABLED)
                zone_pset_set(zone, pool_default->pool_pset->pset_id);
        mutex_enter(&zone_status_lock);
        ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
        zone_status_set(zone, ZONE_IS_INITIALIZED);
        mutex_exit(&zone_status_lock);
        mutex_exit(&zonehash_lock);
        mutex_exit(&cpu_lock);
        pool_unlock();

        /* Now call the create callback for this key */
        zsd_apply_all_keys(zsd_apply_create, zone);

        /* The callbacks are complete. Mark ZONE_IS_READY */
        mutex_enter(&zone_status_lock);
        ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
        zone_status_set(zone, ZONE_IS_READY);
        mutex_exit(&zone_status_lock);

        /*
         * Once we see the zone transition to the ZONE_IS_BOOTING state,
         * we launch init, and set the state to running.
         */
        zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");

        /*
         * Init is only launched when the status is exactly
         * ZONE_IS_BOOTING at this point; any other status skips the
         * launch and falls through to the wait below.
         */
        if (zone_status_get(zone) == ZONE_IS_BOOTING) {
                id_t cid;

                /*
                 * Ok, this is a little complicated.  We need to grab the
                 * zone's pool's scheduling class ID; note that by now, we
                 * are already bound to a pool if we need to be (zoneadmd
                 * will have done that to us while we're in the READY
                 * state).  *But* the scheduling class for the zone's 'init'
                 * must be explicitly passed to newproc, which doesn't
                 * respect pool bindings.
                 *
                 * We hold the pool_lock across the call to newproc() to
                 * close the obvious race: the pool's scheduling class
                 * could change before we manage to create the LWP with
                 * classid 'cid'.
                 */
                pool_lock();
                /*
                 * A positive zone_defaultcid takes precedence over the
                 * pool's class; defaultcid is the last resort.
                 */
                if (zone->zone_defaultcid > 0)
                        cid = zone->zone_defaultcid;
                else
                        cid = pool_get_class(zone->zone_pool);
                if (cid == -1)
                        cid = defaultcid;

                /*
                 * If this fails, zone_boot will ultimately fail.  The
                 * state of the zone will be set to SHUTTING_DOWN-- userland
                 * will have to tear down the zone, and fail, or try again.
                 */
                if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
                    minclsyspri - 1, &ct, 0)) != 0) {
                        mutex_enter(&zone_status_lock);
                        zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
                        mutex_exit(&zone_status_lock);
                } else {
                        zone->zone_boot_time = gethrestime_sec();
                }

                pool_unlock();
        }

        /*
         * Wait for zone_destroy() to be called.  This is what we spend
         * most of our life doing.
         */
        zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");

        /* ct is only non-NULL if the newproc() call above filled it in */
        if (ct)
                /*
                 * At this point the process contract should be empty.
                 * (Though if it isn't, it's not the end of the world.)
                 */
                VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);

        /*
         * Allow kcred to be freed when all referring processes
         * (including this one) go away.  We can't just do this in
         * zone_free because we need to wait for the zone_cred_ref to
         * drop to 0 before calling zone_free, and the existence of
         * zone_kcred will prevent that.  Thus, we call crfree here to
         * balance the crdup in zone_create.  The crhold calls earlier
         * in zsched will be dropped when the thread and process exit.
         */
        crfree(zone->zone_kcred);
        zone->zone_kcred = NULL;

        exit(CLD_EXITED, 0);
}
4143 4150  
4144 4151  /*
4145 4152   * Helper function to determine if there are any submounts of the
4146 4153   * provided path.  Used to make sure the zone doesn't "inherit" any
4147 4154   * mounts from before it is created.
4148 4155   */
4149 4156  static uint_t
4150 4157  zone_mount_count(const char *rootpath)
4151 4158  {
4152 4159          vfs_t *vfsp;
4153 4160          uint_t count = 0;
4154 4161          size_t rootpathlen = strlen(rootpath);
4155 4162  
4156 4163          /*
4157 4164           * Holding zonehash_lock prevents race conditions with
4158 4165           * vfs_list_add()/vfs_list_remove() since we serialize with
4159 4166           * zone_find_by_path().
4160 4167           */
4161 4168          ASSERT(MUTEX_HELD(&zonehash_lock));
4162 4169          /*
4163 4170           * The rootpath must end with a '/'
4164 4171           */
4165 4172          ASSERT(rootpath[rootpathlen - 1] == '/');
4166 4173  
4167 4174          /*
4168 4175           * This intentionally does not count the rootpath itself if that
4169 4176           * happens to be a mount point.
4170 4177           */
4171 4178          vfs_list_read_lock();
4172 4179          vfsp = rootvfs;
4173 4180          do {
4174 4181                  if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4175 4182                      rootpathlen) == 0)
4176 4183                          count++;
4177 4184                  vfsp = vfsp->vfs_next;
4178 4185          } while (vfsp != rootvfs);
4179 4186          vfs_list_unlock();
4180 4187          return (count);
4181 4188  }
4182 4189  
4183 4190  /*
4184 4191   * Helper function to make sure that a zone created on 'rootpath'
4185 4192   * wouldn't end up containing other zones' rootpaths.
4186 4193   */
4187 4194  static boolean_t
4188 4195  zone_is_nested(const char *rootpath)
4189 4196  {
4190 4197          zone_t *zone;
4191 4198          size_t rootpathlen = strlen(rootpath);
4192 4199          size_t len;
4193 4200  
4194 4201          ASSERT(MUTEX_HELD(&zonehash_lock));
4195 4202  
4196 4203          /*
4197 4204           * zone_set_root() appended '/' and '\0' at the end of rootpath
4198 4205           */
4199 4206          if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4200 4207              (rootpath[1] == '/') && (rootpath[2] == '\0'))
4201 4208                  return (B_TRUE);
4202 4209  
4203 4210          for (zone = list_head(&zone_active); zone != NULL;
4204 4211              zone = list_next(&zone_active, zone)) {
4205 4212                  if (zone == global_zone)
4206 4213                          continue;
4207 4214                  len = strlen(zone->zone_rootpath);
4208 4215                  if (strncmp(rootpath, zone->zone_rootpath,
4209 4216                      MIN(rootpathlen, len)) == 0)
4210 4217                          return (B_TRUE);
4211 4218          }
4212 4219          return (B_FALSE);
4213 4220  }
4214 4221  
4215 4222  static int
4216 4223  zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4217 4224      size_t zone_privssz)
4218 4225  {
4219 4226          priv_set_t *privs;
4220 4227  
4221 4228          if (zone_privssz < sizeof (priv_set_t))
4222 4229                  return (ENOMEM);
4223 4230  
4224 4231          privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4225 4232  
4226 4233          if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4227 4234                  kmem_free(privs, sizeof (priv_set_t));
4228 4235                  return (EFAULT);
4229 4236          }
4230 4237  
4231 4238          zone->zone_privset = privs;
4232 4239          return (0);
4233 4240  }
4234 4241  
4235 4242  /*
4236 4243   * We make creative use of nvlists to pass in rctls from userland.  The list is
4237 4244   * a list of the following structures:
4238 4245   *
4239 4246   * (name = rctl_name, value = nvpair_list_array)
4240 4247   *
4241 4248   * Where each element of the nvpair_list_array is of the form:
4242 4249   *
4243 4250   * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4244 4251   *      (name = "limit", value = uint64_t),
4245 4252   *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4246 4253   */
4247 4254  static int
4248 4255  parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4249 4256  {
4250 4257          nvpair_t *nvp = NULL;
4251 4258          nvlist_t *nvl = NULL;
4252 4259          char *kbuf;
4253 4260          int error;
4254 4261          rctl_val_t rv;
4255 4262  
4256 4263          *nvlp = NULL;
4257 4264  
4258 4265          if (buflen == 0)
4259 4266                  return (0);
4260 4267  
4261 4268          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4262 4269                  return (ENOMEM);
4263 4270          if (copyin(ubuf, kbuf, buflen)) {
4264 4271                  error = EFAULT;
4265 4272                  goto out;
4266 4273          }
4267 4274          if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4268 4275                  /*
4269 4276                   * nvl may have been allocated/free'd, but the value set to
4270 4277                   * non-NULL, so we reset it here.
4271 4278                   */
4272 4279                  nvl = NULL;
4273 4280                  error = EINVAL;
4274 4281                  goto out;
4275 4282          }
4276 4283          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4277 4284                  rctl_dict_entry_t *rde;
4278 4285                  rctl_hndl_t hndl;
4279 4286                  nvlist_t **nvlarray;
4280 4287                  uint_t i, nelem;
4281 4288                  char *name;
4282 4289  
4283 4290                  error = EINVAL;
4284 4291                  name = nvpair_name(nvp);
4285 4292                  if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4286 4293                      != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4287 4294                          goto out;
4288 4295                  }
4289 4296                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
4290 4297                          goto out;
4291 4298                  }
4292 4299                  rde = rctl_dict_lookup_hndl(hndl);
4293 4300                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4294 4301                  ASSERT(error == 0);
4295 4302                  for (i = 0; i < nelem; i++) {
4296 4303                          if (error = nvlist2rctlval(nvlarray[i], &rv))
4297 4304                                  goto out;
4298 4305                  }
4299 4306                  if (rctl_invalid_value(rde, &rv)) {
4300 4307                          error = EINVAL;
4301 4308                          goto out;
4302 4309                  }
4303 4310          }
4304 4311          error = 0;
4305 4312          *nvlp = nvl;
4306 4313  out:
4307 4314          kmem_free(kbuf, buflen);
4308 4315          if (error && nvl != NULL)
4309 4316                  nvlist_free(nvl);
4310 4317          return (error);
4311 4318  }
4312 4319  
4313 4320  int
4314 4321  zone_create_error(int er_error, int er_ext, int *er_out)
4315 4322  {
4316 4323          if (er_out != NULL) {
4317 4324                  if (copyout(&er_ext, er_out, sizeof (int))) {
4318 4325                          return (set_errno(EFAULT));
4319 4326                  }
4320 4327          }
4321 4328          return (set_errno(er_error));
4322 4329  }
4323 4330  
4324 4331  static int
4325 4332  zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4326 4333  {
4327 4334          ts_label_t *tsl;
4328 4335          bslabel_t blab;
4329 4336  
4330 4337          /* Get label from user */
4331 4338          if (copyin(lab, &blab, sizeof (blab)) != 0)
4332 4339                  return (EFAULT);
4333 4340          tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4334 4341          if (tsl == NULL)
4335 4342                  return (ENOMEM);
4336 4343  
4337 4344          zone->zone_slabel = tsl;
4338 4345          return (0);
4339 4346  }
4340 4347  
4341 4348  /*
4342 4349   * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4343 4350   */
4344 4351  static int
4345 4352  parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4346 4353  {
4347 4354          char *kbuf;
4348 4355          char *dataset, *next;
4349 4356          zone_dataset_t *zd;
4350 4357          size_t len;
4351 4358  
4352 4359          if (ubuf == NULL || buflen == 0)
4353 4360                  return (0);
4354 4361  
4355 4362          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4356 4363                  return (ENOMEM);
4357 4364  
4358 4365          if (copyin(ubuf, kbuf, buflen) != 0) {
4359 4366                  kmem_free(kbuf, buflen);
4360 4367                  return (EFAULT);
4361 4368          }
4362 4369  
4363 4370          dataset = next = kbuf;
4364 4371          for (;;) {
4365 4372                  zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4366 4373  
4367 4374                  next = strchr(dataset, ',');
4368 4375  
4369 4376                  if (next == NULL)
4370 4377                          len = strlen(dataset);
4371 4378                  else
4372 4379                          len = next - dataset;
4373 4380  
4374 4381                  zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4375 4382                  bcopy(dataset, zd->zd_dataset, len);
4376 4383                  zd->zd_dataset[len] = '\0';
4377 4384  
4378 4385                  list_insert_head(&zone->zone_datasets, zd);
4379 4386  
4380 4387                  if (next == NULL)
4381 4388                          break;
4382 4389  
4383 4390                  dataset = next + 1;
4384 4391          }
4385 4392  
4386 4393          kmem_free(kbuf, buflen);
4387 4394          return (0);
4388 4395  }
4389 4396  
4390 4397  /*
4391 4398   * System call to create/initialize a new zone named 'zone_name', rooted
4392 4399   * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4393 4400   * and initialized with the zone-wide rctls described in 'rctlbuf', and
4394 4401   * with labeling set by 'match', 'doi', and 'label'.

↓ open down ↓

1362 lines elided

↑ open up ↑

4395 4402   *
4396 4403   * If extended error is non-null, we may use it to return more detailed
4397 4404   * error information.
4398 4405   */
4399 4406  static zoneid_t
4400 4407  zone_create(const char *zone_name, const char *zone_root,
4401 4408      const priv_set_t *zone_privs, size_t zone_privssz,
4402 4409      caddr_t rctlbuf, size_t rctlbufsz,
4403 4410      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4404 4411      int match, uint32_t doi, const bslabel_t *label,
4405      -    int flags)
     4412 +    int flags, zoneid_t zone_did)
4406 4413  {
4407 4414          struct zsched_arg zarg;
4408 4415          nvlist_t *rctls = NULL;
4409 4416          proc_t *pp = curproc;
4410 4417          zone_t *zone, *ztmp;
4411 4418          zoneid_t zoneid, start = GLOBAL_ZONEID;
4412 4419          int error;
4413 4420          int error2 = 0;
4414 4421          char *str;
4415 4422          cred_t *zkcr;
4416 4423          boolean_t insert_label_hash;
4417 4424  
4418 4425          if (secpolicy_zone_config(CRED()) != 0)
4419 4426                  return (set_errno(EPERM));
4420 4427  
4421 4428          /* can't boot zone from within chroot environment */
4422 4429          if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4423 4430                  return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4424 4431                      extended_error));
     4432 +
4425 4433          /*
4426 4434           * As the first step of zone creation, we want to allocate a zoneid.
4427 4435           * This allocation is complicated by the fact that netstacks use the
4428 4436           * zoneid to determine their stackid, but netstacks themselves are
4429 4437           * freed asynchronously with respect to zone destruction.  This means
4430 4438           * that a netstack reference leak (or in principle, an extraordinarily
4431 4439           * long netstack reference hold) could result in a zoneid being
4432 4440           * allocated that in fact corresponds to a stackid from an active
4433 4441           * (referenced) netstack -- unleashing all sorts of havoc when that
4434 4442           * netstack is actually (re)used.  (In the abstract, we might wish a

4435 4443           * zoneid to not be deallocated until its last referencing netstack
4436 4444           * has been released, but netstacks lack a backpointer into their
4437 4445           * referencing zone -- and changing them to have such a pointer would
4438 4446           * be substantial, to put it euphemistically.)  To avoid this, we
4439 4447           * detect this condition on allocation: if we have allocated a zoneid
4440 4448           * that corresponds to a netstack that's still in use, we warn about
4441 4449           * it (as it is much more likely to be a reference leak than an actual
4442 4450           * netstack reference), free it, and allocate another.  That these
4443 4451           * identifers are allocated out of an ID space assures that we won't
4444 4452           * see the identifier we just allocated.
4445 4453           */
4446 4454          for (;;) {
4447 4455                  zoneid = id_alloc(zoneid_space);
4448 4456  
4449 4457                  if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4450 4458                          break;
4451 4459  
4452 4460                  id_free(zoneid_space, zoneid);
4453 4461  
4454 4462                  if (start == GLOBAL_ZONEID) {
4455 4463                          start = zoneid;
4456 4464                  } else if (zoneid == start) {
4457 4465                          /*
4458 4466                           * We have managed to iterate over the entire available
4459 4467                           * zoneid space -- there are no identifiers available,
4460 4468                           * presumably due to some number of leaked netstack
4461 4469                           * references.  While it's in principle possible for us
4462 4470                           * to continue to try, it seems wiser to give up at
4463 4471                           * this point to warn and fail explicitly with a
4464 4472                           * distinctive error.
4465 4473                           */

↓ open down ↓

31 lines elided

↑ open up ↑

4466 4474                          cmn_err(CE_WARN, "zone_create() failed: all available "
4467 4475                              "zone IDs have netstacks still in use");
4468 4476                          return (set_errno(ENFILE));
4469 4477                  }
4470 4478  
4471 4479                  cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4472 4480                      "netstack still in use", zoneid);
4473 4481          }
4474 4482  
4475 4483          zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
     4484 +
4476 4485          zone->zone_id = zoneid;
     4486 +        zone->zone_did = zone_did;
4477 4487          zone->zone_status = ZONE_IS_UNINITIALIZED;
4478 4488          zone->zone_pool = pool_default;
4479 4489          zone->zone_pool_mod = gethrtime();
4480 4490          zone->zone_psetid = ZONE_PS_INVAL;
4481 4491          zone->zone_ncpus = 0;
4482 4492          zone->zone_ncpus_online = 0;
4483 4493          zone->zone_restart_init = B_TRUE;
4484 4494          zone->zone_brand = &native_brand;
4485 4495          zone->zone_initname = NULL;
4486 4496          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);

4487 4497          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4488 4498          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4489 4499          cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4490 4500          list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4491 4501              offsetof(zone_ref_t, zref_linkage));
4492 4502          list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4493 4503              offsetof(struct zsd_entry, zsd_linkage));
4494 4504          list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4495 4505              offsetof(zone_dataset_t, zd_linkage));
4496 4506          list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4497 4507              offsetof(zone_dl_t, zdl_linkage));
4498 4508          rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4499 4509          rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4500 4510  
4501 4511          if (flags & ZCF_NET_EXCL) {
4502 4512                  zone->zone_flags |= ZF_NET_EXCL;
4503 4513          }
4504 4514  
4505 4515          if ((error = zone_set_name(zone, zone_name)) != 0) {
4506 4516                  zone_free(zone);
4507 4517                  return (zone_create_error(error, 0, extended_error));
4508 4518          }
4509 4519  
4510 4520          if ((error = zone_set_root(zone, zone_root)) != 0) {
4511 4521                  zone_free(zone);
4512 4522                  return (zone_create_error(error, 0, extended_error));
4513 4523          }
4514 4524          if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4515 4525                  zone_free(zone);
4516 4526                  return (zone_create_error(error, 0, extended_error));
4517 4527          }
4518 4528  
4519 4529          /* initialize node name to be the same as zone name */
4520 4530          zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4521 4531          (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4522 4532          zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4523 4533  
4524 4534          zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4525 4535          zone->zone_domain[0] = '\0';
4526 4536          zone->zone_hostid = HW_INVALID_HOSTID;
4527 4537          zone->zone_shares = 1;
4528 4538          zone->zone_shmmax = 0;
4529 4539          zone->zone_ipc.ipcq_shmmni = 0;
4530 4540          zone->zone_ipc.ipcq_semmni = 0;
4531 4541          zone->zone_ipc.ipcq_msgmni = 0;
4532 4542          zone->zone_bootargs = NULL;
4533 4543          zone->zone_fs_allowed = NULL;
4534 4544  
4535 4545          psecflags_default(&zone->zone_secflags);
4536 4546  
4537 4547          zone->zone_initname =
4538 4548              kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4539 4549          (void) strcpy(zone->zone_initname, zone_default_initname);
4540 4550          zone->zone_nlwps = 0;
4541 4551          zone->zone_nlwps_ctl = INT_MAX;
4542 4552          zone->zone_nprocs = 0;
4543 4553          zone->zone_nprocs_ctl = INT_MAX;
4544 4554          zone->zone_locked_mem = 0;
4545 4555          zone->zone_locked_mem_ctl = UINT64_MAX;
4546 4556          zone->zone_max_swap = 0;
4547 4557          zone->zone_max_swap_ctl = UINT64_MAX;
4548 4558          zone->zone_max_lofi = 0;
4549 4559          zone->zone_max_lofi_ctl = UINT64_MAX;
4550 4560          zone0.zone_lockedmem_kstat = NULL;
4551 4561          zone0.zone_swapresv_kstat = NULL;
4552 4562  
4553 4563          zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
4554 4564  
4555 4565          /*
4556 4566           * Zsched initializes the rctls.
4557 4567           */
4558 4568          zone->zone_rctls = NULL;
4559 4569  
4560 4570          if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4561 4571                  zone_free(zone);
4562 4572                  return (zone_create_error(error, 0, extended_error));
4563 4573          }
4564 4574  
4565 4575          if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4566 4576                  zone_free(zone);
4567 4577                  return (set_errno(error));
4568 4578          }
4569 4579  
4570 4580          /*
4571 4581           * Read in the trusted system parameters:
4572 4582           * match flag and sensitivity label.
4573 4583           */
4574 4584          zone->zone_match = match;
4575 4585          if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4576 4586                  /* Fail if requested to set doi to anything but system's doi */
4577 4587                  if (doi != 0 && doi != default_doi) {
4578 4588                          zone_free(zone);
4579 4589                          return (set_errno(EINVAL));
4580 4590                  }
4581 4591                  /* Always apply system's doi to the zone */
4582 4592                  error = zone_set_label(zone, label, default_doi);
4583 4593                  if (error != 0) {
4584 4594                          zone_free(zone);
4585 4595                          return (set_errno(error));
4586 4596                  }
4587 4597                  insert_label_hash = B_TRUE;
4588 4598          } else {
4589 4599                  /* all zones get an admin_low label if system is not labeled */
4590 4600                  zone->zone_slabel = l_admin_low;
4591 4601                  label_hold(l_admin_low);
4592 4602                  insert_label_hash = B_FALSE;
4593 4603          }
4594 4604  
4595 4605          /*
4596 4606           * Stop all lwps since that's what normally happens as part of fork().
4597 4607           * This needs to happen before we grab any locks to avoid deadlock
4598 4608           * (another lwp in the process could be waiting for the held lock).
4599 4609           */
4600 4610          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4601 4611                  zone_free(zone);
4602 4612                  nvlist_free(rctls);
4603 4613                  return (zone_create_error(error, 0, extended_error));
4604 4614          }
4605 4615  
4606 4616          if (block_mounts(zone) == 0) {
4607 4617                  mutex_enter(&pp->p_lock);
4608 4618                  if (curthread != pp->p_agenttp)
4609 4619                          continuelwps(pp);
4610 4620                  mutex_exit(&pp->p_lock);
4611 4621                  zone_free(zone);
4612 4622                  nvlist_free(rctls);
4613 4623                  return (zone_create_error(error, 0, extended_error));
4614 4624          }
4615 4625  
4616 4626          /*
4617 4627           * Set up credential for kernel access.  After this, any errors
4618 4628           * should go through the dance in errout rather than calling
4619 4629           * zone_free directly.
4620 4630           */
4621 4631          zone->zone_kcred = crdup(kcred);
4622 4632          crsetzone(zone->zone_kcred, zone);
4623 4633          priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4624 4634          priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4625 4635          priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4626 4636          priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4627 4637  
4628 4638          mutex_enter(&zonehash_lock);
4629 4639          /*
4630 4640           * Make sure zone doesn't already exist.
4631 4641           *
4632 4642           * If the system and zone are labeled,
4633 4643           * make sure no other zone exists that has the same label.
4634 4644           */
4635 4645          if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4636 4646              (insert_label_hash &&
4637 4647              (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4638 4648                  zone_status_t status;
4639 4649  
4640 4650                  status = zone_status_get(ztmp);
4641 4651                  if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4642 4652                          error = EEXIST;
4643 4653                  else
4644 4654                          error = EBUSY;
4645 4655  
4646 4656                  if (insert_label_hash)
4647 4657                          error2 = ZE_LABELINUSE;
4648 4658  
4649 4659                  goto errout;
4650 4660          }
4651 4661  
4652 4662          /*
4653 4663           * Don't allow zone creations which would cause one zone's rootpath to
4654 4664           * be accessible from that of another (non-global) zone.
4655 4665           */
4656 4666          if (zone_is_nested(zone->zone_rootpath)) {
4657 4667                  error = EBUSY;
4658 4668                  goto errout;
4659 4669          }
4660 4670  
4661 4671          ASSERT(zonecount != 0);         /* check for leaks */
4662 4672          if (zonecount + 1 > maxzones) {
4663 4673                  error = ENOMEM;
4664 4674                  goto errout;
4665 4675          }
4666 4676  
4667 4677          if (zone_mount_count(zone->zone_rootpath) != 0) {
4668 4678                  error = EBUSY;
4669 4679                  error2 = ZE_AREMOUNTS;
4670 4680                  goto errout;
4671 4681          }
4672 4682  
4673 4683          /*
4674 4684           * Zone is still incomplete, but we need to drop all locks while
4675 4685           * zsched() initializes this zone's kernel process.  We
4676 4686           * optimistically add the zone to the hashtable and associated
4677 4687           * lists so a parallel zone_create() doesn't try to create the
4678 4688           * same zone.
4679 4689           */
4680 4690          zonecount++;
4681 4691          (void) mod_hash_insert(zonehashbyid,
4682 4692              (mod_hash_key_t)(uintptr_t)zone->zone_id,
4683 4693              (mod_hash_val_t)(uintptr_t)zone);
4684 4694          str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4685 4695          (void) strcpy(str, zone->zone_name);
4686 4696          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4687 4697              (mod_hash_val_t)(uintptr_t)zone);
4688 4698          if (insert_label_hash) {
4689 4699                  (void) mod_hash_insert(zonehashbylabel,
4690 4700                      (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4691 4701                  zone->zone_flags |= ZF_HASHED_LABEL;
4692 4702          }
4693 4703  
4694 4704          /*
4695 4705           * Insert into active list.  At this point there are no 'hold's
4696 4706           * on the zone, but everyone else knows not to use it, so we can
4697 4707           * continue to use it.  zsched() will do a zone_hold() if the
4698 4708           * newproc() is successful.
4699 4709           */
4700 4710          list_insert_tail(&zone_active, zone);
4701 4711          mutex_exit(&zonehash_lock);
4702 4712  
4703 4713          zarg.zone = zone;
4704 4714          zarg.nvlist = rctls;
4705 4715          /*
4706 4716           * The process, task, and project rctls are probably wrong;
4707 4717           * we need an interface to get the default values of all rctls,
4708 4718           * and initialize zsched appropriately.  I'm not sure that that
4709 4719           * makes much of a difference, though.
4710 4720           */
4711 4721          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4712 4722          if (error != 0) {
4713 4723                  /*
4714 4724                   * We need to undo all globally visible state.
4715 4725                   */
4716 4726                  mutex_enter(&zonehash_lock);
4717 4727                  list_remove(&zone_active, zone);
4718 4728                  if (zone->zone_flags & ZF_HASHED_LABEL) {
4719 4729                          ASSERT(zone->zone_slabel != NULL);
4720 4730                          (void) mod_hash_destroy(zonehashbylabel,
4721 4731                              (mod_hash_key_t)zone->zone_slabel);
4722 4732                  }
4723 4733                  (void) mod_hash_destroy(zonehashbyname,
4724 4734                      (mod_hash_key_t)(uintptr_t)zone->zone_name);
4725 4735                  (void) mod_hash_destroy(zonehashbyid,
4726 4736                      (mod_hash_key_t)(uintptr_t)zone->zone_id);
4727 4737                  ASSERT(zonecount > 1);
4728 4738                  zonecount--;
4729 4739                  goto errout;
4730 4740          }
4731 4741  
4732 4742          /*
4733 4743           * Zone creation can't fail from now on.
4734 4744           */
4735 4745  
4736 4746          /*
4737 4747           * Create zone kstats
4738 4748           */
4739 4749          zone_kstat_create(zone);
4740 4750  
4741 4751          /*
4742 4752           * Let the other lwps continue.
4743 4753           */
4744 4754          mutex_enter(&pp->p_lock);
4745 4755          if (curthread != pp->p_agenttp)
4746 4756                  continuelwps(pp);
4747 4757          mutex_exit(&pp->p_lock);
4748 4758  
4749 4759          /*
4750 4760           * Wait for zsched to finish initializing the zone.
4751 4761           */
4752 4762          zone_status_wait(zone, ZONE_IS_READY);
4753 4763          /*
4754 4764           * The zone is fully visible, so we can let mounts progress.
4755 4765           */
4756 4766          resume_mounts(zone);
4757 4767          nvlist_free(rctls);
4758 4768  
4759 4769          return (zoneid);
4760 4770  
4761 4771  errout:
4762 4772          mutex_exit(&zonehash_lock);
4763 4773          /*
4764 4774           * Let the other lwps continue.
4765 4775           */
4766 4776          mutex_enter(&pp->p_lock);
4767 4777          if (curthread != pp->p_agenttp)
4768 4778                  continuelwps(pp);
4769 4779          mutex_exit(&pp->p_lock);
4770 4780  
4771 4781          resume_mounts(zone);
4772 4782          nvlist_free(rctls);
4773 4783          /*
4774 4784           * There is currently one reference to the zone, a cred_ref from
4775 4785           * zone_kcred.  To free the zone, we call crfree, which will call
4776 4786           * zone_cred_rele, which will call zone_free.
4777 4787           */
4778 4788          ASSERT(zone->zone_cred_ref == 1);
4779 4789          ASSERT(zone->zone_kcred->cr_ref == 1);
4780 4790          ASSERT(zone->zone_ref == 0);
4781 4791          zkcr = zone->zone_kcred;
4782 4792          zone->zone_kcred = NULL;
4783 4793          crfree(zkcr);                           /* triggers call to zone_free */
4784 4794          return (zone_create_error(error, error2, extended_error));
4785 4795  }
4786 4796  
4787 4797  /*
4788 4798   * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4789 4799   * the heavy lifting.  initname is the path to the program to launch
4790 4800   * at the "top" of the zone; if this is NULL, we use the system default,
4791 4801   * which is stored at zone_default_initname.
4792 4802   */
4793 4803  static int
4794 4804  zone_boot(zoneid_t zoneid)
4795 4805  {
4796 4806          int err;
4797 4807          zone_t *zone;
4798 4808  
4799 4809          if (secpolicy_zone_config(CRED()) != 0)
4800 4810                  return (set_errno(EPERM));
4801 4811          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4802 4812                  return (set_errno(EINVAL));
4803 4813  
4804 4814          mutex_enter(&zonehash_lock);
4805 4815          /*
4806 4816           * Look for zone under hash lock to prevent races with calls to
4807 4817           * zone_shutdown, zone_destroy, etc.
4808 4818           */
4809 4819          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4810 4820                  mutex_exit(&zonehash_lock);
4811 4821                  return (set_errno(EINVAL));
4812 4822          }
4813 4823  
4814 4824          mutex_enter(&zone_status_lock);
4815 4825          if (zone_status_get(zone) != ZONE_IS_READY) {
4816 4826                  mutex_exit(&zone_status_lock);
4817 4827                  mutex_exit(&zonehash_lock);
4818 4828                  return (set_errno(EINVAL));
4819 4829          }
4820 4830          zone_status_set(zone, ZONE_IS_BOOTING);
4821 4831          mutex_exit(&zone_status_lock);
4822 4832  
4823 4833          zone_hold(zone);        /* so we can use the zone_t later */
4824 4834          mutex_exit(&zonehash_lock);
4825 4835  
4826 4836          if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4827 4837                  zone_rele(zone);
4828 4838                  return (set_errno(EINTR));
4829 4839          }
4830 4840  
4831 4841          /*
4832 4842           * Boot (starting init) might have failed, in which case the zone
4833 4843           * will go to the SHUTTING_DOWN state; an appropriate errno will
4834 4844           * be placed in zone->zone_boot_err, and so we return that.
4835 4845           */
4836 4846          err = zone->zone_boot_err;
4837 4847          zone_rele(zone);
4838 4848          return (err ? set_errno(err) : 0);
4839 4849  }
4840 4850  
4841 4851  /*
4842 4852   * Kills all user processes in the zone, waiting for them all to exit
4843 4853   * before returning.
4844 4854   */
4845 4855  static int
4846 4856  zone_empty(zone_t *zone)
4847 4857  {
4848 4858          int waitstatus;
4849 4859  
4850 4860          /*
4851 4861           * We need to drop zonehash_lock before killing all
4852 4862           * processes, otherwise we'll deadlock with zone_find_*
4853 4863           * which can be called from the exit path.
4854 4864           */
4855 4865          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4856 4866          while ((waitstatus = zone_status_timedwait_sig(zone,
4857 4867              ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4858 4868                  killall(zone->zone_id);
4859 4869          }
4860 4870          /*
4861 4871           * return EINTR if we were signaled
4862 4872           */
4863 4873          if (waitstatus == 0)
4864 4874                  return (EINTR);
4865 4875          return (0);
4866 4876  }
4867 4877  
4868 4878  /*
4869 4879   * This function implements the policy for zone visibility.
4870 4880   *
4871 4881   * In standard Solaris, a non-global zone can only see itself.
4872 4882   *
4873 4883   * In Trusted Extensions, a labeled zone can lookup any zone whose label
4874 4884   * it dominates. For this test, the label of the global zone is treated as
4875 4885   * admin_high so it is special-cased instead of being checked for dominance.
4876 4886   *
4877 4887   * Returns true if zone attributes are viewable, false otherwise.
4878 4888   */
4879 4889  static boolean_t
4880 4890  zone_list_access(zone_t *zone)
4881 4891  {
4882 4892  
4883 4893          if (curproc->p_zone == global_zone ||
4884 4894              curproc->p_zone == zone) {
4885 4895                  return (B_TRUE);
4886 4896          } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4887 4897                  bslabel_t *curproc_label;
4888 4898                  bslabel_t *zone_label;
4889 4899  
4890 4900                  curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4891 4901                  zone_label = label2bslabel(zone->zone_slabel);
4892 4902  
4893 4903                  if (zone->zone_id != GLOBAL_ZONEID &&
4894 4904                      bldominates(curproc_label, zone_label)) {
4895 4905                          return (B_TRUE);
4896 4906                  } else {
4897 4907                          return (B_FALSE);
4898 4908                  }
4899 4909          } else {
4900 4910                  return (B_FALSE);
4901 4911          }
4902 4912  }
4903 4913  
4904 4914  /*
4905 4915   * Systemcall to start the zone's halt sequence.  By the time this
4906 4916   * function successfully returns, all user processes and kernel threads
4907 4917   * executing in it will have exited, ZSD shutdown callbacks executed,
4908 4918   * and the zone status set to ZONE_IS_DOWN.
4909 4919   *
4910 4920   * It is possible that the call will interrupt itself if the caller is the
4911 4921   * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4912 4922   */
4913 4923  static int
4914 4924  zone_shutdown(zoneid_t zoneid)
4915 4925  {
4916 4926          int error;
4917 4927          zone_t *zone;
4918 4928          zone_status_t status;
4919 4929  
4920 4930          if (secpolicy_zone_config(CRED()) != 0)
4921 4931                  return (set_errno(EPERM));
4922 4932          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4923 4933                  return (set_errno(EINVAL));
4924 4934  
4925 4935          mutex_enter(&zonehash_lock);
4926 4936          /*
4927 4937           * Look for zone under hash lock to prevent races with other
4928 4938           * calls to zone_shutdown and zone_destroy.
4929 4939           */
4930 4940          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4931 4941                  mutex_exit(&zonehash_lock);
4932 4942                  return (set_errno(EINVAL));
4933 4943          }
4934 4944  
4935 4945          /*
4936 4946           * We have to drop zonehash_lock before calling block_mounts.
4937 4947           * Hold the zone so we can continue to use the zone_t.
4938 4948           */
4939 4949          zone_hold(zone);
4940 4950          mutex_exit(&zonehash_lock);
4941 4951  
4942 4952          /*
4943 4953           * Block mounts so that VFS_MOUNT() can get an accurate view of
4944 4954           * the zone's status with regards to ZONE_IS_SHUTTING down.
4945 4955           *
4946 4956           * e.g. NFS can fail the mount if it determines that the zone
4947 4957           * has already begun the shutdown sequence.
4948 4958           *
4949 4959           */
4950 4960          if (block_mounts(zone) == 0) {
4951 4961                  zone_rele(zone);
4952 4962                  return (set_errno(EINTR));
4953 4963          }
4954 4964  
4955 4965          mutex_enter(&zonehash_lock);
4956 4966          mutex_enter(&zone_status_lock);
4957 4967          status = zone_status_get(zone);
4958 4968          /*
4959 4969           * Fail if the zone isn't fully initialized yet.
4960 4970           */
4961 4971          if (status < ZONE_IS_READY) {
4962 4972                  mutex_exit(&zone_status_lock);
4963 4973                  mutex_exit(&zonehash_lock);
4964 4974                  resume_mounts(zone);
4965 4975                  zone_rele(zone);
4966 4976                  return (set_errno(EINVAL));
4967 4977          }
4968 4978          /*
4969 4979           * If conditions required for zone_shutdown() to return have been met,
4970 4980           * return success.
4971 4981           */
4972 4982          if (status >= ZONE_IS_DOWN) {
4973 4983                  mutex_exit(&zone_status_lock);
4974 4984                  mutex_exit(&zonehash_lock);
4975 4985                  resume_mounts(zone);
4976 4986                  zone_rele(zone);
4977 4987                  return (0);
4978 4988          }
4979 4989          /*
4980 4990           * If zone_shutdown() hasn't been called before, go through the motions.
4981 4991           * If it has, there's nothing to do but wait for the kernel threads to
4982 4992           * drain.
4983 4993           */
4984 4994          if (status < ZONE_IS_EMPTY) {
4985 4995                  uint_t ntasks;
4986 4996  
4987 4997                  mutex_enter(&zone->zone_lock);
4988 4998                  if ((ntasks = zone->zone_ntasks) != 1) {
4989 4999                          /*
4990 5000                           * There's still stuff running.
4991 5001                           */
4992 5002                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4993 5003                  }
4994 5004                  mutex_exit(&zone->zone_lock);
4995 5005                  if (ntasks == 1) {
4996 5006                          /*
4997 5007                           * The only way to create another task is through
4998 5008                           * zone_enter(), which will block until we drop
4999 5009                           * zonehash_lock.  The zone is empty.
5000 5010                           */
5001 5011                          if (zone->zone_kthreads == NULL) {
5002 5012                                  /*
5003 5013                                   * Skip ahead to ZONE_IS_DOWN
5004 5014                                   */
5005 5015                                  zone_status_set(zone, ZONE_IS_DOWN);
5006 5016                          } else {
5007 5017                                  zone_status_set(zone, ZONE_IS_EMPTY);
5008 5018                          }
5009 5019                  }
5010 5020          }
5011 5021          mutex_exit(&zone_status_lock);
5012 5022          mutex_exit(&zonehash_lock);
5013 5023          resume_mounts(zone);
5014 5024  
5015 5025          if (error = zone_empty(zone)) {
5016 5026                  zone_rele(zone);
5017 5027                  return (set_errno(error));
5018 5028          }
5019 5029          /*
5020 5030           * After the zone status goes to ZONE_IS_DOWN this zone will no
5021 5031           * longer be notified of changes to the pools configuration, so
5022 5032           * in order to not end up with a stale pool pointer, we point
5023 5033           * ourselves at the default pool and remove all resource
5024 5034           * visibility.  This is especially important as the zone_t may
5025 5035           * languish on the deathrow for a very long time waiting for
5026 5036           * cred's to drain out.
5027 5037           *
5028 5038           * This rebinding of the zone can happen multiple times
5029 5039           * (presumably due to interrupted or parallel systemcalls)
5030 5040           * without any adverse effects.
5031 5041           */
5032 5042          if (pool_lock_intr() != 0) {
5033 5043                  zone_rele(zone);
5034 5044                  return (set_errno(EINTR));
5035 5045          }
5036 5046          if (pool_state == POOL_ENABLED) {
5037 5047                  mutex_enter(&cpu_lock);
5038 5048                  zone_pool_set(zone, pool_default);
5039 5049                  /*
5040 5050                   * The zone no longer needs to be able to see any cpus.
5041 5051                   */
5042 5052                  zone_pset_set(zone, ZONE_PS_INVAL);
5043 5053                  mutex_exit(&cpu_lock);
5044 5054          }
5045 5055          pool_unlock();
5046 5056  
5047 5057          /*
5048 5058           * ZSD shutdown callbacks can be executed multiple times, hence
5049 5059           * it is safe to not be holding any locks across this call.
5050 5060           */
5051 5061          zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5052 5062  
5053 5063          mutex_enter(&zone_status_lock);
5054 5064          if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5055 5065                  zone_status_set(zone, ZONE_IS_DOWN);
5056 5066          mutex_exit(&zone_status_lock);
5057 5067  
5058 5068          /*
5059 5069           * Wait for kernel threads to drain.
5060 5070           */
5061 5071          if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5062 5072                  zone_rele(zone);
5063 5073                  return (set_errno(EINTR));
5064 5074          }
5065 5075  
5066 5076          /*
5067 5077           * Zone can be become down/destroyable even if the above wait
5068 5078           * returns EINTR, so any code added here may never execute.
5069 5079           * (i.e. don't add code here)
5070 5080           */
5071 5081  
5072 5082          zone_rele(zone);
5073 5083          return (0);
5074 5084  }
5075 5085  
5076 5086  /*
5077 5087   * Log the specified zone's reference counts.  The caller should not be
5078 5088   * holding the zone's zone_lock.
            *
            * The general-purpose (zone_ref) and credential (zone_cred_ref) counts
            * are always logged; any nonzero per-subsystem counts from
            * zone_subsys_ref[] are appended as a bracketed "name: count" list.
            *
            * As a side effect, ZF_REFCOUNTS_LOGGED is set in zone->zone_flags;
            * zone_destroy() tests that flag before deciding whether to log.
            *
            * zone_lock is taken here to snapshot the counters, and it is still
            * held across the KM_SLEEP allocation of the message buffer.
5079 5089   */
5080 5090  static void
5081 5091  zone_log_refcounts(zone_t *zone)
5082 5092  {
5083 5093          char *buffer;
5084 5094          char *buffer_position;
5085 5095          uint32_t buffer_size;
5086 5096          uint32_t index;
5087 5097          uint_t ref;
5088 5098          uint_t cred_ref;
5089 5099  
5090 5100          /*
5091 5101           * Construct a string representing the subsystem-specific reference
5092 5102           * counts.  The counts are printed in ascending order by index into the
5093 5103           * zone_t::zone_subsys_ref array.  The list will be surrounded by
5094 5104           * square brackets [] and will only contain nonzero reference counts.
5095 5105           *
5096 5106           * Each nonzero count is budgeted its subsystem name plus 13
5097 5107           * characters: one colon, one space, up to ten digits, and one comma.
5098 5108           * (Unsigned 32-bit integers have at most ten decimal digits.)  Two
5099 5109           * more bytes cover the opening square bracket and the terminating
5100 5110           * NUL character; the closing square bracket needs no extra space
5101 5111           * because it overwrites the last reference count's comma.
5102 5112           *
5103 5113           * NOTE: We have to grab the zone's zone_lock to create a consistent
5104 5114           * snapshot of the zone's reference counters.
5105 5115           *
5106 5116           * First, figure out how much space the string buffer will need.
5107 5117           * The buffer's size is stored in buffer_size.
5108 5118           */
5109 5119          buffer_size = 2;                        /* for '[' and the trailing NUL */
5110 5120          mutex_enter(&zone->zone_lock);
5111 5121          zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5112 5122          ref = zone->zone_ref;
5113 5123          cred_ref = zone->zone_cred_ref;
5114 5124          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5115 5125                  if (zone->zone_subsys_ref[index] != 0)
5116 5126                          buffer_size += strlen(zone_ref_subsys_names[index]) +
5117 5127                              13;
5118 5128          if (buffer_size == 2) {
5119 5129                  /*
5120 5130                   * No subsystems had nonzero reference counts.  Don't bother
5121 5131                   * with allocating a buffer; just log the general-purpose and
5122 5132                   * credential reference counts.
5123 5133                   */
5124 5134                  mutex_exit(&zone->zone_lock);
5125 5135                  (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5126 5136                      "Zone '%s' (ID: %d) is shutting down, but %u zone "
5127 5137                      "references and %u credential references are still extant",
5128 5138                      zone->zone_name, zone->zone_id, ref, cred_ref);
5129 5139                  return;
5130 5140          }
5131 5141  
5132 5142          /*
5133 5143           * buffer_size is an upper bound on the number of characters that the
5134 5144           * buffer will need (it is exact only when every logged count has ten
5135 5145           * digits).  Allocate the buffer and fill it with nonzero
5136 5146           * subsystem-specific reference counts.  Surround the results with
                    * square brackets afterwards.
                    *
                    * zone_lock is still held here, so the counts formatted below are
                    * the same ones that were used to size the buffer.
5137 5147           */
5138 5148          buffer = kmem_alloc(buffer_size, KM_SLEEP);
5139 5149          buffer_position = &buffer[1];
5140 5150          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5141 5151                  /*
5142 5152                   * NOTE: The DDI's version of sprintf() returns a pointer to
5143 5153                   * the modified buffer rather than the number of bytes written
5144 5154                   * (as in snprintf(3C)).  This is unfortunate and annoying.
5145 5155                   * Therefore, we'll use snprintf() with INT_MAX to get the
5146 5156                   * number of bytes written.  Using INT_MAX is safe because
5147 5157                   * the buffer was sized above for the worst case: we'll never
5148 5158                   * overrun the buffer.
5149 5159                   */
5150 5160                  if (zone->zone_subsys_ref[index] != 0)
5151 5161                          buffer_position += snprintf(buffer_position, INT_MAX,
5152 5162                              "%s: %u,", zone_ref_subsys_names[index],
5153 5163                              zone->zone_subsys_ref[index]);
5154 5164          }
5155 5165          mutex_exit(&zone->zone_lock);
                   /*
                    * Add the opening bracket and turn the trailing comma left by the
                    * last entry into the closing bracket.  buffer_position points at the
                    * NUL written by the final snprintf() call.
                    */
5156 5166          buffer[0] = '[';
5157 5167          ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5158 5168          ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5159 5169          buffer_position[-1] = ']';
5160 5170  
5161 5171          /*
5162 5172           * Log the reference counts and free the message buffer.
5163 5173           */
5164 5174          (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5165 5175              "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5166 5176              "%u credential references are still extant %s", zone->zone_name,
5167 5177              zone->zone_id, ref, cred_ref, buffer);
5168 5178          kmem_free(buffer, buffer_size);
5169 5179  }
5170 5180  
5171 5181  /*
5172 5182   * Systemcall entry point to finalize the zone halt process.  The caller
5173 5183   * must have already successfully called zone_shutdown().
5174 5184   *
5175 5185   * Upon successful completion, the zone will have been fully destroyed:
5176 5186   * zsched will have exited, destructor callbacks executed, and the zone
5177 5187   * removed from the list of active zones.
            *
            * Returns 0 on success, including the case where the zone is found to
            * have gone away while this thread waits for its references to drain.
            * Otherwise returns the result of set_errno() with one of:
            *   EPERM   secpolicy_zone_config() rejected the caller's credentials;
            *   EINVAL  zoneid is outside [MIN_USERZONEID, MAX_ZONEID], or no zone
            *           with that id was found;
            *   EBUSY   zone_mount_count() is nonzero for the zone's root path, or
            *           the zone's status is below ZONE_IS_DOWN;
            *   EINTR   a wait on zone_destroy_cv was interrupted by a signal.
5178 5188   */
5179 5189  static int
5180 5190  zone_destroy(zoneid_t zoneid)
5181 5191  {
5182 5192          uint64_t uniqid;
5183 5193          zone_t *zone;
5184 5194          zone_status_t status;
5185 5195          clock_t wait_time;
5186 5196          boolean_t log_refcounts;
5187 5197  
5188 5198          if (secpolicy_zone_config(CRED()) != 0)
5189 5199                  return (set_errno(EPERM));
5190 5200          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5191 5201                  return (set_errno(EINVAL));
5192 5202  
5193 5203          mutex_enter(&zonehash_lock);
5194 5204          /*
5195 5205           * Look for zone under hash lock to prevent races with other
5196 5206           * calls to zone_destroy.
5197 5207           */
5198 5208          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5199 5209                  mutex_exit(&zonehash_lock);
5200 5210                  return (set_errno(EINVAL));
5201 5211          }
5202 5212  
5203 5213          if (zone_mount_count(zone->zone_rootpath) != 0) {
5204 5214                  mutex_exit(&zonehash_lock);
5205 5215                  return (set_errno(EBUSY));
5206 5216          }
                   /*
                    * Only a zone that is at least ZONE_IS_DOWN may be destroyed.  A
                    * zone that is exactly ZONE_IS_DOWN is moved to ZONE_IS_DYING here;
                    * a zone already past that state is left as it is.
                    */
5207 5217          mutex_enter(&zone_status_lock);
5208 5218          status = zone_status_get(zone);
5209 5219          if (status < ZONE_IS_DOWN) {
5210 5220                  mutex_exit(&zone_status_lock);
5211 5221                  mutex_exit(&zonehash_lock);
5212 5222                  return (set_errno(EBUSY));
5213 5223          } else if (status == ZONE_IS_DOWN) {
5214 5224                  zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5215 5225          }
5216 5226          mutex_exit(&zone_status_lock);
                   /*
                    * Take a hold before dropping zonehash_lock; it is released below
                    * once the ZSD destroy callbacks have run.
                    */
5217 5227          zone_hold(zone);
5218 5228          mutex_exit(&zonehash_lock);
5219 5229  
5220 5230          /*
5221 5231           * wait for zsched to exit
5222 5232           */
5223 5233          zone_status_wait(zone, ZONE_IS_DEAD);
5224 5234          zone_zsd_callbacks(zone, ZSD_DESTROY);
5225 5235          zone->zone_netstack = NULL;
                   /*
                    * Remember the zone's unique id so that the lookup loop below can
                    * tell this zone apart from a different zone found under the same
                    * zoneid.
                    */
5226 5236          uniqid = zone->zone_uniqid;
5227 5237          zone_rele(zone);
5228 5238          zone = NULL;    /* potentially free'd */
5229 5239  
5230 5240          log_refcounts = B_FALSE;
5231 5241          wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5232 5242          mutex_enter(&zonehash_lock);
5233 5243          for (; /* ever */; ) {
5234 5244                  boolean_t unref;
5235 5245                  boolean_t refs_have_been_logged;
5236 5246  
5237 5247                  if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5238 5248                      zone->zone_uniqid != uniqid) {
5239 5249                          /*
5240 5250                           * The zone has gone away.  Necessary conditions
5241 5251                           * are met, so we return success.
5242 5252                           */
5243 5253                          mutex_exit(&zonehash_lock);
5244 5254                          return (0);
5245 5255                  }
5246 5256                  mutex_enter(&zone->zone_lock);
5247 5257                  unref = ZONE_IS_UNREF(zone);
                           /*
                            * This holds the raw masked flag value (nonzero when
                            * ZF_REFCOUNTS_LOGGED is set); it is only tested for truth.
                            */
5248 5258                  refs_have_been_logged = (zone->zone_flags &
5249 5259                      ZF_REFCOUNTS_LOGGED);
5250 5260                  mutex_exit(&zone->zone_lock);
5251 5261                  if (unref) {
5252 5262                          /*
5253 5263                           * There is only one reference to the zone -- that
5254 5264                           * added when the zone was added to the hashtables --
5255 5265                           * and things will remain this way until we drop
5256 5266                           * zonehash_lock... we can go ahead and clean up the
5257 5267                           * zone.
5258 5268                           */
5259 5269                          break;
5260 5270                  }
5261 5271  
5262 5272                  /*
5263 5273                   * Wait for zone_rele_common() or zone_cred_rele() to signal
5264 5274                   * zone_destroy_cv.  zone_destroy_cv is signaled only when
5265 5275                   * some zone's general-purpose reference count reaches one.
5266 5276                   * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5267 5277                   * on zone_destroy_cv, then log the zone's reference counts and
5268 5278                   * continue to wait for zone_rele() and zone_cred_rele().
                            *
                            * If the zone's ZF_REFCOUNTS_LOGGED flag is already set, both
                            * the timed wait and the logging are skipped and this thread
                            * goes straight to the untimed wait at the bottom of the loop.
5269 5279                   */
5270 5280                  if (!refs_have_been_logged) {
5271 5281                          if (!log_refcounts) {
5272 5282                                  /*
5273 5283                                   * This thread hasn't timed out waiting on
5274 5284                                   * zone_destroy_cv yet.  Wait wait_time clock
5275 5285                                   * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5276 5286                                   * seconds) for the zone's references to clear.
5277 5287                                   */
5278 5288                                  ASSERT(wait_time > 0);
5279 5289                                  wait_time = cv_reltimedwait_sig(
5280 5290                                      &zone_destroy_cv, &zonehash_lock, wait_time,
5281 5291                                      TR_SEC);
5282 5292                                  if (wait_time > 0) {
5283 5293                                          /*
5284 5294                                           * A thread in zone_rele() or
5285 5295                                           * zone_cred_rele() signaled
5286 5296                                           * zone_destroy_cv before this thread's
5287 5297                                           * wait timed out.  The zone might have
5288 5298                                           * only one reference left; find out!
5289 5299                                           */
5290 5300                                          continue;
5291 5301                                  } else if (wait_time == 0) {
5292 5302                                          /* The thread's process was signaled. */
5293 5303                                          mutex_exit(&zonehash_lock);
5294 5304                                          return (set_errno(EINTR));
5295 5305                                  }
5296 5306  
5297 5307                                  /*
5298 5308                                   * The thread timed out while waiting on
5299 5309                                   * zone_destroy_cv.  Even though the thread
5300 5310                                   * timed out, it has to check whether another
5301 5311                                   * thread woke up from zone_destroy_cv and
5302 5312                                   * destroyed the zone.
5303 5313                                   *
5304 5314                                   * If the zone still exists and has more than
5305 5315                                   * one unreleased general-purpose reference,
5306 5316                                   * then log the zone's reference counts.
5307 5317                                   */
5308 5318                                  log_refcounts = B_TRUE;
5309 5319                                  continue;
5310 5320                          }
5311 5321  
5312 5322                          /*
5313 5323                           * The thread already timed out on zone_destroy_cv while
5314 5324                           * waiting for subsystems to release the zone's last
5315 5325                           * general-purpose references.  Log the zone's reference
5316 5326                           * counts and wait indefinitely on zone_destroy_cv.
                                    *
                                    * zone_lock was dropped above, as zone_log_refcounts()
                                    * requires; zonehash_lock is still held across the call.
5317 5327                           */
5318 5328                          zone_log_refcounts(zone);
5319 5329                  }
5320 5330                  if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5321 5331                          /* The thread's process was signaled. */
5322 5332                          mutex_exit(&zonehash_lock);
5323 5333                          return (set_errno(EINTR));
5324 5334                  }
5325 5335          }
5326 5336  
                   /*
                    * The loop is only left through the "unref" break, so zonehash_lock
                    * is still held from here until the hash entries have been removed.
                    */
5327 5337          /*
5328 5338           * Remove CPU cap for this zone now since we're not going to
5329 5339           * fail below this point.
5330 5340           */
5331 5341          cpucaps_zone_remove(zone);
5332 5342  
5333 5343          /* Get rid of the zone's kstats */
5334 5344          zone_kstat_delete(zone);
5335 5345  
5336 5346          /* remove the pfexecd doors */
5337 5347          if (zone->zone_pfexecd != NULL) {
5338 5348                  klpd_freelist(&zone->zone_pfexecd);
5339 5349                  zone->zone_pfexecd = NULL;
5340 5350          }
5341 5351  
5342 5352          /* free brand specific data */
5343 5353          if (ZONE_IS_BRANDED(zone))
5344 5354                  ZBROP(zone)->b_free_brand_data(zone);
5345 5355  
5346 5356          /* Say goodbye to brand framework. */
5347 5357          brand_unregister_zone(zone->zone_brand);
5348 5358  
5349 5359          /*
5350 5360           * It is now safe to let the zone be recreated; remove it from the
5351 5361           * lists.  The memory will not be freed until the last cred
5352 5362           * reference goes away.
5353 5363           */
5354 5364          ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5355 5365          zonecount--;
5356 5366          /* remove from active list and hash tables */
5357 5367          list_remove(&zone_active, zone);
5358 5368          (void) mod_hash_destroy(zonehashbyname,
5359 5369              (mod_hash_key_t)zone->zone_name);
5360 5370          (void) mod_hash_destroy(zonehashbyid,
5361 5371              (mod_hash_key_t)(uintptr_t)zone->zone_id);
                   /* The label hash entry exists only if ZF_HASHED_LABEL is set. */
5362 5372          if (zone->zone_flags & ZF_HASHED_LABEL)
5363 5373                  (void) mod_hash_destroy(zonehashbylabel,
5364 5374                      (mod_hash_key_t)zone->zone_slabel);
5365 5375          mutex_exit(&zonehash_lock);
5366 5376  
5367 5377          /*
5368 5378           * Release the root vnode; we're not using it anymore.  Nor should any
5369 5379           * other thread that might access it exist.
5370 5380           */
5371 5381          if (zone->zone_rootvp != NULL) {
5372 5382                  VN_RELE(zone->zone_rootvp);
5373 5383                  zone->zone_rootvp = NULL;
5374 5384          }
5375 5385  
5376 5386          /* add to deathrow list */
5377 5387          mutex_enter(&zone_deathrow_lock);
5378 5388          list_insert_tail(&zone_deathrow, zone);
5379 5389          mutex_exit(&zone_deathrow_lock);
5380 5390  
5381 5391          /*
5382 5392           * Drop last reference (which was added by zsched()), this will
5383 5393           * free the zone unless there are outstanding cred references.
5384 5394           */
5385 5395          zone_rele(zone);
5386 5396          return (0);
5387 5397  }
5388 5398  
5389 5399  /*
5390 5400   * Systemcall entry point for zone_getattr(2).
5391 5401   */
5392 5402  static ssize_t
5393 5403  zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5394 5404  {
5395 5405          size_t size;
5396 5406          int error = 0, err;
5397 5407          zone_t *zone;
5398 5408          char *zonepath;
5399 5409          char *outstr;
5400 5410          zone_status_t zone_status;
5401 5411          pid_t initpid;
5402 5412          boolean_t global = (curzone == global_zone);
5403 5413          boolean_t inzone = (curzone->zone_id == zoneid);
5404 5414          ushort_t flags;
5405 5415          zone_net_data_t *zbuf;
5406 5416  
5407 5417          mutex_enter(&zonehash_lock);
5408 5418          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5409 5419                  mutex_exit(&zonehash_lock);
5410 5420                  return (set_errno(EINVAL));
5411 5421          }
5412 5422          zone_status = zone_status_get(zone);
5413 5423          if (zone_status < ZONE_IS_INITIALIZED) {
5414 5424                  mutex_exit(&zonehash_lock);
5415 5425                  return (set_errno(EINVAL));
5416 5426          }
5417 5427          zone_hold(zone);
5418 5428          mutex_exit(&zonehash_lock);
5419 5429  
5420 5430          /*
5421 5431           * If not in the global zone, don't show information about other zones,
5422 5432           * unless the system is labeled and the local zone's label dominates
5423 5433           * the other zone.
5424 5434           */
5425 5435          if (!zone_list_access(zone)) {
5426 5436                  zone_rele(zone);
5427 5437                  return (set_errno(EINVAL));
5428 5438          }
5429 5439  
5430 5440          switch (attr) {
5431 5441          case ZONE_ATTR_ROOT:
5432 5442                  if (global) {
5433 5443                          /*
5434 5444                           * Copy the path to trim the trailing "/" (except for
5435 5445                           * the global zone).
5436 5446                           */
5437 5447                          if (zone != global_zone)
5438 5448                                  size = zone->zone_rootpathlen - 1;
5439 5449                          else
5440 5450                                  size = zone->zone_rootpathlen;
5441 5451                          zonepath = kmem_alloc(size, KM_SLEEP);
5442 5452                          bcopy(zone->zone_rootpath, zonepath, size);
5443 5453                          zonepath[size - 1] = '\0';
5444 5454                  } else {
5445 5455                          if (inzone || !is_system_labeled()) {
5446 5456                                  /*
5447 5457                                   * Caller is not in the global zone.
5448 5458                                   * if the query is on the current zone
5449 5459                                   * or the system is not labeled,
5450 5460                                   * just return faked-up path for current zone.
5451 5461                                   */
5452 5462                                  zonepath = "/";
5453 5463                                  size = 2;
5454 5464                          } else {
5455 5465                                  /*
5456 5466                                   * Return related path for current zone.
5457 5467                                   */
5458 5468                                  int prefix_len = strlen(zone_prefix);
5459 5469                                  int zname_len = strlen(zone->zone_name);
5460 5470  
5461 5471                                  size = prefix_len + zname_len + 1;
5462 5472                                  zonepath = kmem_alloc(size, KM_SLEEP);
5463 5473                                  bcopy(zone_prefix, zonepath, prefix_len);
5464 5474                                  bcopy(zone->zone_name, zonepath +
5465 5475                                      prefix_len, zname_len);
5466 5476                                  zonepath[size - 1] = '\0';
5467 5477                          }
5468 5478                  }
5469 5479                  if (bufsize > size)
5470 5480                          bufsize = size;
5471 5481                  if (buf != NULL) {
5472 5482                          err = copyoutstr(zonepath, buf, bufsize, NULL);
5473 5483                          if (err != 0 && err != ENAMETOOLONG)
5474 5484                                  error = EFAULT;
5475 5485                  }
5476 5486                  if (global || (is_system_labeled() && !inzone))
5477 5487                          kmem_free(zonepath, size);
5478 5488                  break;
5479 5489  
5480 5490          case ZONE_ATTR_NAME:
5481 5491                  size = strlen(zone->zone_name) + 1;
5482 5492                  if (bufsize > size)
5483 5493                          bufsize = size;
5484 5494                  if (buf != NULL) {
5485 5495                          err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5486 5496                          if (err != 0 && err != ENAMETOOLONG)
5487 5497                                  error = EFAULT;
5488 5498                  }
5489 5499                  break;
5490 5500  
5491 5501          case ZONE_ATTR_STATUS:
5492 5502                  /*
5493 5503                   * Since we're not holding zonehash_lock, the zone status
5494 5504                   * may be anything; leave it up to userland to sort it out.
5495 5505                   */
5496 5506                  size = sizeof (zone_status);
5497 5507                  if (bufsize > size)
5498 5508                          bufsize = size;
5499 5509                  zone_status = zone_status_get(zone);
5500 5510                  if (buf != NULL &&
5501 5511                      copyout(&zone_status, buf, bufsize) != 0)
5502 5512                          error = EFAULT;
5503 5513                  break;
5504 5514          case ZONE_ATTR_FLAGS:
5505 5515                  size = sizeof (zone->zone_flags);
5506 5516                  if (bufsize > size)
5507 5517                          bufsize = size;
5508 5518                  flags = zone->zone_flags;
5509 5519                  if (buf != NULL &&
5510 5520                      copyout(&flags, buf, bufsize) != 0)
5511 5521                          error = EFAULT;
5512 5522                  break;
5513 5523          case ZONE_ATTR_PRIVSET:
5514 5524                  size = sizeof (priv_set_t);
5515 5525                  if (bufsize > size)
5516 5526                          bufsize = size;
5517 5527                  if (buf != NULL &&
5518 5528                      copyout(zone->zone_privset, buf, bufsize) != 0)
5519 5529                          error = EFAULT;
5520 5530                  break;
5521 5531          case ZONE_ATTR_UNIQID:
5522 5532                  size = sizeof (zone->zone_uniqid);
5523 5533                  if (bufsize > size)
5524 5534                          bufsize = size;
5525 5535                  if (buf != NULL &&
5526 5536                      copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5527 5537                          error = EFAULT;
5528 5538                  break;
5529 5539          case ZONE_ATTR_POOLID:
5530 5540                  {
5531 5541                          pool_t *pool;
5532 5542                          poolid_t poolid;
5533 5543  
5534 5544                          if (pool_lock_intr() != 0) {
5535 5545                                  error = EINTR;
5536 5546                                  break;
5537 5547                          }
5538 5548                          pool = zone_pool_get(zone);
5539 5549                          poolid = pool->pool_id;
5540 5550                          pool_unlock();
5541 5551                          size = sizeof (poolid);
5542 5552                          if (bufsize > size)
5543 5553                                  bufsize = size;
5544 5554                          if (buf != NULL && copyout(&poolid, buf, size) != 0)
5545 5555                                  error = EFAULT;
5546 5556                  }
5547 5557                  break;
5548 5558          case ZONE_ATTR_SLBL:
5549 5559                  size = sizeof (bslabel_t);
5550 5560                  if (bufsize > size)
5551 5561                          bufsize = size;
5552 5562                  if (zone->zone_slabel == NULL)
5553 5563                          error = EINVAL;
5554 5564                  else if (buf != NULL &&
5555 5565                      copyout(label2bslabel(zone->zone_slabel), buf,
5556 5566                      bufsize) != 0)
5557 5567                          error = EFAULT;
5558 5568                  break;
5559 5569          case ZONE_ATTR_INITPID:
5560 5570                  size = sizeof (initpid);
5561 5571                  if (bufsize > size)
5562 5572                          bufsize = size;
5563 5573                  initpid = zone->zone_proc_initpid;
5564 5574                  if (initpid == -1) {
5565 5575                          error = ESRCH;
5566 5576                          break;
5567 5577                  }
5568 5578                  if (buf != NULL &&
5569 5579                      copyout(&initpid, buf, bufsize) != 0)
5570 5580                          error = EFAULT;
5571 5581                  break;
5572 5582          case ZONE_ATTR_BRAND:
5573 5583                  size = strlen(zone->zone_brand->b_name) + 1;
5574 5584  
5575 5585                  if (bufsize > size)
5576 5586                          bufsize = size;
5577 5587                  if (buf != NULL) {
5578 5588                          err = copyoutstr(zone->zone_brand->b_name, buf,
5579 5589                              bufsize, NULL);
5580 5590                          if (err != 0 && err != ENAMETOOLONG)
5581 5591                                  error = EFAULT;
5582 5592                  }
5583 5593                  break;
5584 5594          case ZONE_ATTR_INITNAME:
5585 5595                  size = strlen(zone->zone_initname) + 1;
5586 5596                  if (bufsize > size)
5587 5597                          bufsize = size;
5588 5598                  if (buf != NULL) {
5589 5599                          err = copyoutstr(zone->zone_initname, buf, bufsize,
5590 5600                              NULL);
5591 5601                          if (err != 0 && err != ENAMETOOLONG)
5592 5602                                  error = EFAULT;
5593 5603                  }
5594 5604                  break;
5595 5605          case ZONE_ATTR_BOOTARGS:
5596 5606                  if (zone->zone_bootargs == NULL)
5597 5607                          outstr = "";
5598 5608                  else
5599 5609                          outstr = zone->zone_bootargs;
5600 5610                  size = strlen(outstr) + 1;
5601 5611                  if (bufsize > size)
5602 5612                          bufsize = size;
5603 5613                  if (buf != NULL) {
5604 5614                          err = copyoutstr(outstr, buf, bufsize, NULL);
5605 5615                          if (err != 0 && err != ENAMETOOLONG)
5606 5616                                  error = EFAULT;
5607 5617                  }
5608 5618                  break;
5609 5619          case ZONE_ATTR_PHYS_MCAP:
5610 5620                  size = sizeof (zone->zone_phys_mcap);
5611 5621                  if (bufsize > size)
5612 5622                          bufsize = size;
5613 5623                  if (buf != NULL &&
5614 5624                      copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5615 5625                          error = EFAULT;
5616 5626                  break;
5617 5627          case ZONE_ATTR_SCHED_CLASS:
5618 5628                  mutex_enter(&class_lock);
5619 5629  
5620 5630                  if (zone->zone_defaultcid >= loaded_classes)
5621 5631                          outstr = "";
5622 5632                  else
5623 5633                          outstr = sclass[zone->zone_defaultcid].cl_name;
5624 5634                  size = strlen(outstr) + 1;
5625 5635                  if (bufsize > size)
5626 5636                          bufsize = size;
5627 5637                  if (buf != NULL) {
5628 5638                          err = copyoutstr(outstr, buf, bufsize, NULL);
5629 5639                          if (err != 0 && err != ENAMETOOLONG)
5630 5640                                  error = EFAULT;
5631 5641                  }
5632 5642  
5633 5643                  mutex_exit(&class_lock);
5634 5644                  break;
5635 5645          case ZONE_ATTR_HOSTID:
5636 5646                  if (zone->zone_hostid != HW_INVALID_HOSTID &&
5637 5647                      bufsize == sizeof (zone->zone_hostid)) {
5638 5648                          size = sizeof (zone->zone_hostid);
5639 5649                          if (buf != NULL && copyout(&zone->zone_hostid, buf,
5640 5650                              bufsize) != 0)
5641 5651                                  error = EFAULT;
5642 5652                  } else {
5643 5653                          error = EINVAL;
5644 5654                  }
5645 5655                  break;
5646 5656          case ZONE_ATTR_FS_ALLOWED:
5647 5657                  if (zone->zone_fs_allowed == NULL)
5648 5658                          outstr = "";
5649 5659                  else
5650 5660                          outstr = zone->zone_fs_allowed;
5651 5661                  size = strlen(outstr) + 1;
5652 5662                  if (bufsize > size)
5653 5663                          bufsize = size;
5654 5664                  if (buf != NULL) {
5655 5665                          err = copyoutstr(outstr, buf, bufsize, NULL);
5656 5666                          if (err != 0 && err != ENAMETOOLONG)
5657 5667                                  error = EFAULT;
5658 5668                  }
5659 5669                  break;
5660 5670          case ZONE_ATTR_SECFLAGS:
5661 5671                  size = sizeof (zone->zone_secflags);
5662 5672                  if (bufsize > size)
5663 5673                          bufsize = size;
5664 5674                  if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
5665 5675                          error = EFAULT;
5666 5676                  break;
5667 5677          case ZONE_ATTR_NETWORK:
5668 5678                  bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
5669 5679                  size = bufsize;

↓ open down ↓

1183 lines elided

↑ open up ↑

5670 5680                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5671 5681                  if (copyin(buf, zbuf, bufsize) != 0) {
5672 5682                          error = EFAULT;
5673 5683                  } else {
5674 5684                          error = zone_get_network(zoneid, zbuf);
5675 5685                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5676 5686                                  error = EFAULT;
5677 5687                  }
5678 5688                  kmem_free(zbuf, bufsize);
5679 5689                  break;
     5690 +        case ZONE_ATTR_DID:
     5691 +                size = sizeof (zoneid_t);
     5692 +                if (bufsize > size)
     5693 +                        bufsize = size;
     5694 +
     5695 +                if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
     5696 +                        error = EFAULT;
     5697 +                break;
5680 5698          default:
5681 5699                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5682 5700                          size = bufsize;
5683 5701                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5684 5702                  } else {
5685 5703                          error = EINVAL;
5686 5704                  }
5687 5705          }
5688 5706          zone_rele(zone);
5689 5707

5690 5708          if (error)
5691 5709                  return (set_errno(error));
5692 5710          return ((ssize_t)size);
5693 5711  }
5694 5712  
5695 5713  /*
5696 5714   * Systemcall entry point for zone_setattr(2).
            *
            * Sets attribute 'attr' of the zone identified by 'zoneid' from the
            * caller-supplied buffer 'buf' of 'bufsize' bytes.
            *
            * The caller must pass secpolicy_zone_config(); otherwise EPERM is
            * returned.  On the global zone only ZONE_ATTR_PHYS_MCAP is accepted.
            * For every attribute other than ZONE_ATTR_PHYS_MCAP the zone's status
            * must not be past ZONE_IS_READY, otherwise EINVAL is returned.
            *
            * Attributes not handled explicitly are forwarded to the brand's
            * b_setattr operation when attr >= ZONE_ATTR_BRAND_ATTRS and the zone
            * is branded; anything else yields EINVAL.
            *
            * Returns 0 on success, or the result of set_errno() with EPERM,
            * EINVAL, EFAULT, or the error handed back by the per-attribute
            * setter.
5697 5715   */
5698 5716  /*ARGSUSED*/
5699 5717  static int
5700 5718  zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5701 5719  {
5702 5720          zone_t *zone;
5703 5721          zone_status_t zone_status;
                   /* -1 is a sentinel; the ASSERT at 'done' checks it was replaced. */
5704 5722          int err = -1;
5705 5723          zone_net_data_t *zbuf;
5706 5724  
5707 5725          if (secpolicy_zone_config(CRED()) != 0)
5708 5726                  return (set_errno(EPERM));
5709 5727  
5710 5728          /*
5711 5729           * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5712 5730           * global zone.
5713 5731           */
5714 5732          if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5715 5733                  return (set_errno(EINVAL));
5716 5734          }
5717 5735  
                   /*
                    * Look the zone up under zonehash_lock and take a hold before
                    * the lock is dropped; the hold is released at 'done'.
                    */
5718 5736          mutex_enter(&zonehash_lock);
5719 5737          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5720 5738                  mutex_exit(&zonehash_lock);
5721 5739                  return (set_errno(EINVAL));
5722 5740          }
5723 5741          zone_hold(zone);
5724 5742          mutex_exit(&zonehash_lock);
5725 5743  
5726 5744          /*
5727 5745           * At present most attributes can only be set on non-running,
5728 5746           * non-global zones.
5729 5747           */
5730 5748          zone_status = zone_status_get(zone);
5731 5749          if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5732 5750                  err = EINVAL;
5733 5751                  goto done;
5734 5752          }
5735 5753  
                   /* Dispatch to the per-attribute setter; each case sets 'err'. */
5736 5754          switch (attr) {
5737 5755          case ZONE_ATTR_INITNAME:
5738 5756                  err = zone_set_initname(zone, (const char *)buf);
5739 5757                  break;
5740 5758          case ZONE_ATTR_INITNORESTART:
                           /* 'buf' and 'bufsize' are not consulted for this one. */
5741 5759                  zone->zone_restart_init = B_FALSE;
5742 5760                  err = 0;
5743 5761                  break;
5744 5762          case ZONE_ATTR_BOOTARGS:
5745 5763                  err = zone_set_bootargs(zone, (const char *)buf);
5746 5764                  break;
5747 5765          case ZONE_ATTR_BRAND:
5748 5766                  err = zone_set_brand(zone, (const char *)buf);
5749 5767                  break;
5750 5768          case ZONE_ATTR_FS_ALLOWED:
5751 5769                  err = zone_set_fs_allowed(zone, (const char *)buf);
5752 5770                  break;
5753 5771          case ZONE_ATTR_SECFLAGS:
5754 5772                  err = zone_set_secflags(zone, (psecflags_t *)buf);
5755 5773                  break;
5756 5774          case ZONE_ATTR_PHYS_MCAP:
5757 5775                  err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5758 5776                  break;
5759 5777          case ZONE_ATTR_SCHED_CLASS:
5760 5778                  err = zone_set_sched_class(zone, (const char *)buf);
5761 5779                  break;
5762 5780          case ZONE_ATTR_HOSTID:
                           /*
                            * The buffer must be exactly the size of zone_hostid.
                            * NOTE(review): copyin() writes straight into the zone
                            * field with no staging copy or value validation here;
                            * confirm that is intended.
                            */
5763 5781                  if (bufsize == sizeof (zone->zone_hostid)) {
5764 5782                          if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5765 5783                                  err = 0;
5766 5784                          else
5767 5785                                  err = EFAULT;
5768 5786                  } else {
5769 5787                          err = EINVAL;
5770 5788                  }
5771 5789                  break;
5772 5790          case ZONE_ATTR_NETWORK:
                           /*
                            * Bound the kernel allocation made on the caller's
                            * behalf.
                            * NOTE(review): only an upper bound is enforced, and
                            * zone_set_network() is handed no length.  Confirm a
                            * buffer shorter than zone_net_data_t (including
                            * bufsize == 0) cannot be read past its end.
                            */
5773 5791                  if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5774 5792                          err = EINVAL;
5775 5793                          break;
5776 5794                  }
5777 5795                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5778 5796                  if (copyin(buf, zbuf, bufsize) != 0) {
5779 5797                          kmem_free(zbuf, bufsize);
5780 5798                          err = EFAULT;
5781 5799                          break;
5782 5800                  }
5783 5801                  err = zone_set_network(zoneid, zbuf);
5784 5802                  kmem_free(zbuf, bufsize);
5785 5803                  break;
5786 5804          default:
                           /* Unknown here: let the zone's brand handle it, if any. */
5787 5805                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5788 5806                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5789 5807                  else
5790 5808                          err = EINVAL;
5791 5809          }
5792 5810  
5793 5811  done:
                   /* Drop the hold taken after the lookup above. */
5794 5812          zone_rele(zone);
5795 5813          ASSERT(err != -1);
5796 5814          return (err != 0 ? set_errno(err) : 0);
5797 5815  }
5798 5816  
5799 5817  /*
5800 5818   * Return zero if the process has at least one vnode mapped in to its
5801 5819   * address space which shouldn't be allowed to change zones.
5802 5820   *
5803 5821   * Also return zero if the process has any shared mappings which reserve
5804 5822   * swap.  This is because the counting for zone.max-swap does not allow swap
5805 5823   * reservation to be shared between zones.  zone swap reservation is counted
5806 5824   * on zone->zone_max_swap.
            *
            * Otherwise return 1.  The check operates on curproc's address space,
            * which is walked segment by segment with the address space lock held
            * as reader; the walk stops at the first offending segment.
5807 5825   */
5808 5826  static int
5809 5827  as_can_change_zones(void)
5810 5828  {
5811 5829          proc_t *pp = curproc;
5812 5830          struct seg *seg;
5813 5831          struct as *as = pp->p_as;
5814 5832          vnode_t *vp;
                   /* Assume the move is allowed until a segment says otherwise. */
5815 5833          int allow = 1;
5816 5834  
5817 5835          ASSERT(pp->p_as != &kas);
5818 5836          AS_LOCK_ENTER(as, RW_READER);
5819 5837          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5820 5838  
5821 5839                  /*
5822 5840                   * Cannot enter zone with shared anon memory which
5823 5841                   * reserves swap.  See comment above.
5824 5842                   */
5825 5843                  if (seg_can_change_zones(seg) == B_FALSE) {
5826 5844                          allow = 0;
5827 5845                          break;
5828 5846                  }
5829 5847                  /*
5830 5848                   * if we can't get a backing vnode for this segment then skip
5831 5849                   * it.
5832 5850                   */
5833 5851                  vp = NULL;
5834 5852                  if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5835 5853                          continue;
5836 5854                  if (!vn_can_change_zones(vp)) { /* bail on first match */
5837 5855                          allow = 0;
5838 5856                          break;
5839 5857                  }
5840 5858          }
5841 5859          AS_LOCK_EXIT(as);
5842 5860          return (allow);
5843 5861  }
5844 5862  
5845 5863  /*
5846 5864   * Count swap reserved by curproc's address space
            *
            * Returns the sum of seg_swresv() over every segment in the address
            * space.  The caller must already hold the address space lock as
            * writer (asserted below); this function takes no locks itself.
5847 5865   */
5848 5866  static size_t
5849 5867  as_swresv(void)
5850 5868  {
5851 5869          proc_t *pp = curproc;
5852 5870          struct seg *seg;
5853 5871          struct as *as = pp->p_as;
5854 5872          size_t swap = 0;
5855 5873  
5856 5874          ASSERT(pp->p_as != &kas);
5857 5875          ASSERT(AS_WRITE_HELD(as));
5858 5876          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5859 5877                  swap += seg_swresv(seg);
5860 5878  
5861 5879          return (swap);
5862 5880  }
5863 5881  
5864 5882  /*
5865 5883   * Systemcall entry point for zone_enter().
5866 5884   *
5867 5885   * The current process is injected into said zone.  In the process
5868 5886   * it will change its project membership, privileges, rootdir/cwd,
5869 5887   * zone-wide rctls, and pool association to match those of the zone.
5870 5888   *
5871 5889   * The first zone_enter() called while the zone is in the ZONE_IS_READY
5872 5890   * state will transition it to ZONE_IS_RUNNING.  Processes may only
5873 5891   * enter a zone that is "ready" or "running".
            *
            * Returns 0 on success, otherwise the result of set_errno() with:
            *   EPERM   secpolicy_zone_config() fails, or the zone's privilege
            *           set is not a subset of CR_OPPRIV(CRED()).
            *   EINVAL  zoneid is outside [MIN_USERZONEID, MAX_ZONEID], the caller
            *           is not in the global zone, the zone does not exist, the
            *           process contract checks fail, or the zone is not in a
            *           joinable state.
            *   EINTR   holdlwps() or pool_lock_intr() fails.
            *   EBADF   files_can_change_zones() refuses.
            *   EFAULT  as_can_change_zones() refuses.
            * or the error returned by pool_do_bind().
            *
            * Every failure after the lwps have been held leaves through the
            * 'out' label so that the other lwps are resumed.
5874 5892   */
5875 5893  static int
5876 5894  zone_enter(zoneid_t zoneid)
5877 5895  {
5878 5896          zone_t *zone;
5879 5897          vnode_t *vp;
5880 5898          proc_t *pp = curproc;
5881 5899          contract_t *ct;
5882 5900          cont_process_t *ctp;
5883 5901          task_t *tk, *oldtk;
5884 5902          kproject_t *zone_proj0;
5885 5903          cred_t *cr, *newcr;
5886 5904          pool_t *oldpool, *newpool;
5887 5905          sess_t *sp;
5888 5906          uid_t uid;
5889 5907          zone_status_t status;
5890 5908          int err = 0;
5891 5909          rctl_entity_p_t e;
5892 5910          size_t swap;
5893 5911          kthread_id_t t;
5894 5912  
                   /* These two checks return directly: no lwps are held yet. */
5895 5913          if (secpolicy_zone_config(CRED()) != 0)
5896 5914                  return (set_errno(EPERM));
5897 5915          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5898 5916                  return (set_errno(EINVAL));
5899 5917  
5900 5918          /*
5901 5919           * Stop all lwps so we don't need to hold a lock to look at
5902 5920           * curproc->p_zone.  This needs to happen before we grab any
5903 5921           * locks to avoid deadlock (another lwp in the process could
5904 5922           * be waiting for the held lock).
5905 5923           */
5906 5924          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5907 5925                  return (set_errno(EINTR));
5908 5926  
5909 5927          /*
5910 5928           * Make sure we're not changing zones with files open or mapped in
5911 5929           * to our address space which shouldn't be changing zones.
5912 5930           */
5913 5931          if (!files_can_change_zones()) {
5914 5932                  err = EBADF;
5915 5933                  goto out;
5916 5934          }
5917 5935          if (!as_can_change_zones()) {
5918 5936                  err = EFAULT;
5919 5937                  goto out;
5920 5938          }
5921 5939  
                   /* Only a process currently in the global zone may enter a zone. */
5922 5940          mutex_enter(&zonehash_lock);
5923 5941          if (pp->p_zone != global_zone) {
5924 5942                  mutex_exit(&zonehash_lock);
5925 5943                  err = EINVAL;
5926 5944                  goto out;
5927 5945          }
5928 5946  
5929 5947          zone = zone_find_all_by_id(zoneid);
5930 5948          if (zone == NULL) {
5931 5949                  mutex_exit(&zonehash_lock);
5932 5950                  err = EINVAL;
5933 5951                  goto out;
5934 5952          }
5935 5953  
5936 5954          /*
5937 5955           * To prevent processes in a zone from holding contracts on
5938 5956           * extrazonal resources, and to avoid process contract
5939 5957           * memberships which span zones, contract holders and processes
5940 5958           * which aren't the sole members of their encapsulating process
5941 5959           * contracts are not allowed to zone_enter.
5942 5960           */
5943 5961          ctp = pp->p_ct_process;
5944 5962          ct = &ctp->conp_contract;
5945 5963          mutex_enter(&ct->ct_lock);
5946 5964          mutex_enter(&pp->p_lock);
5947 5965          if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5948 5966                  mutex_exit(&pp->p_lock);
5949 5967                  mutex_exit(&ct->ct_lock);
5950 5968                  mutex_exit(&zonehash_lock);
5951 5969                  err = EINVAL;
5952 5970                  goto out;
5953 5971          }
5954 5972  
5955 5973          /*
5956 5974           * Moreover, we don't allow processes whose encapsulating
5957 5975           * process contracts have inherited extrazonal contracts.
5958 5976           * While it would be easier to eliminate all process contracts
5959 5977           * with inherited contracts, we need to be able to give a
5960 5978           * restarted init (or other zone-penetrating process) its
5961 5979           * predecessor's contracts.
5962 5980           */
5963 5981          if (ctp->conp_ninherited != 0) {
5964 5982                  contract_t *next;
5965 5983                  for (next = list_head(&ctp->conp_inherited); next;
5966 5984                      next = list_next(&ctp->conp_inherited, next)) {
5967 5985                          if (contract_getzuniqid(next) != zone->zone_uniqid) {
5968 5986                                  mutex_exit(&pp->p_lock);
5969 5987                                  mutex_exit(&ct->ct_lock);
5970 5988                                  mutex_exit(&zonehash_lock);
5971 5989                                  err = EINVAL;
5972 5990                                  goto out;
5973 5991                          }
5974 5992                  }
5975 5993          }
5976 5994  
5977 5995          mutex_exit(&pp->p_lock);
5978 5996          mutex_exit(&ct->ct_lock);
5979 5997  
5980 5998          status = zone_status_get(zone);
5981 5999          if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5982 6000                  /*
5983 6001                   * Can't join
5984 6002                   */
5985 6003                  mutex_exit(&zonehash_lock);
5986 6004                  err = EINVAL;
5987 6005                  goto out;
5988 6006          }
5989 6007  
5990 6008          /*
5991 6009           * Make sure new priv set is within the permitted set for caller
5992 6010           */
5993 6011          if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5994 6012                  mutex_exit(&zonehash_lock);
5995 6013                  err = EPERM;
5996 6014                  goto out;
5997 6015          }
5998 6016          /*
5999 6017           * We want to momentarily drop zonehash_lock while we optimistically
6000 6018           * bind curproc to the pool it should be running in.  This is safe
6001 6019           * since the zone can't disappear (we have a hold on it).
6002 6020           */
6003 6021          zone_hold(zone);
6004 6022          mutex_exit(&zonehash_lock);
6005 6023  
6006 6024          /*
6007 6025           * Grab pool_lock to keep the pools configuration from changing
6008 6026           * and to stop ourselves from getting rebound to another pool
6009 6027           * until we join the zone.
6010 6028           */
6011 6029          if (pool_lock_intr() != 0) {
6012 6030                  zone_rele(zone);
6013 6031                  err = EINTR;
6014 6032                  goto out;
6015 6033          }
6016 6034          ASSERT(secpolicy_pool(CRED()) == 0);
6017 6035          /*
6018 6036           * Bind ourselves to the pool currently associated with the zone.
                    * A failure from pool_do_bind() is returned to the caller as is.
6019 6037           */
6020 6038          oldpool = curproc->p_pool;
6021 6039          newpool = zone_pool_get(zone);
6022 6040          if (pool_state == POOL_ENABLED && newpool != oldpool &&
6023 6041              (err = pool_do_bind(newpool, P_PID, P_MYID,
6024 6042              POOL_BIND_ALL)) != 0) {
6025 6043                  pool_unlock();
6026 6044                  zone_rele(zone);
6027 6045                  goto out;
6028 6046          }
6029 6047  
6030 6048          /*
6031 6049           * Grab cpu_lock now; we'll need it later when we call
6032 6050           * task_join().
6033 6051           */
6034 6052          mutex_enter(&cpu_lock);
6035 6053          mutex_enter(&zonehash_lock);
6036 6054          /*
6037 6055           * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6038 6056           */
6039 6057          if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6040 6058                  /*
6041 6059                   * Can't join anymore.
                            * Undo the optimistic pool binding made above before
                            * bailing out.
6042 6060                   */
6043 6061                  mutex_exit(&zonehash_lock);
6044 6062                  mutex_exit(&cpu_lock);
6045 6063                  if (pool_state == POOL_ENABLED &&
6046 6064                      newpool != oldpool)
6047 6065                          (void) pool_do_bind(oldpool, P_PID, P_MYID,
6048 6066                              POOL_BIND_ALL);
6049 6067                  pool_unlock();
6050 6068                  zone_rele(zone);
6051 6069                  err = EINVAL;
6052 6070                  goto out;
6053 6071          }
6054 6072  
6055 6073          /*
6056 6074           * a_lock must be held while transferring locked memory and swap
6057 6075           * reservation from the global zone to the non global zone because
6058 6076           * asynchronous faults on the processes' address space can lock
6059 6077           * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6060 6078           * segments respectively.
6061 6079           */
6062 6080          AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6063 6081          swap = as_swresv();
6064 6082          mutex_enter(&pp->p_lock);
6065 6083          zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6066 6084          /* verify that we do not exceed any task or lwp limits */
                   /* NOTE(review): no limit check is visible here; counts are only bumped. */
6067 6085          mutex_enter(&zone->zone_nlwps_lock);
6068 6086          /* add new lwps to zone and zone's proj0 */
6069 6087          zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6070 6088          zone->zone_nlwps += pp->p_lwpcnt;
6071 6089          /* add 1 task to zone's proj0 */
6072 6090          zone_proj0->kpj_ntasks += 1;
6073 6091  
6074 6092          zone_proj0->kpj_nprocs++;
6075 6093          zone->zone_nprocs++;
6076 6094          mutex_exit(&zone->zone_nlwps_lock);
6077 6095  
                   /* charge locked memory and swap reservation to the new zone */
6078 6096          mutex_enter(&zone->zone_mem_lock);
6079 6097          zone->zone_locked_mem += pp->p_locked_mem;
6080 6098          zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6081 6099          zone->zone_max_swap += swap;
6082 6100          mutex_exit(&zone->zone_mem_lock);
6083 6101  
6084 6102          mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6085 6103          zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6086 6104          mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6087 6105  
6088 6106          /* remove lwps and process from proc's old zone and old project */
6089 6107          mutex_enter(&pp->p_zone->zone_nlwps_lock);
6090 6108          pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6091 6109          pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6092 6110          pp->p_task->tk_proj->kpj_nprocs--;
6093 6111          pp->p_zone->zone_nprocs--;
6094 6112          mutex_exit(&pp->p_zone->zone_nlwps_lock);
6095 6113  
                   /* and give back the same locked memory and swap from the old zone */
6096 6114          mutex_enter(&pp->p_zone->zone_mem_lock);
6097 6115          pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6098 6116          pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6099 6117          pp->p_zone->zone_max_swap -= swap;
6100 6118          mutex_exit(&pp->p_zone->zone_mem_lock);
6101 6119  
6102 6120          mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6103 6121          pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6104 6122          mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6105 6123  
                   /* Accounting has moved; now point the process at its new zone. */
6106 6124          pp->p_flag |= SZONETOP;
6107 6125          pp->p_zone = zone;
6108 6126          mutex_exit(&pp->p_lock);
6109 6127          AS_LOCK_EXIT(pp->p_as);
6110 6128  
6111 6129          /*
6112 6130           * Joining the zone cannot fail from now on.
6113 6131           *
6114 6132           * This means that a lot of the following code can be commonized and
6115 6133           * shared with zsched().
6116 6134           */
6117 6135  
6118 6136          /*
6119 6137           * If the process contract fmri was inherited, we need to
6120 6138           * flag this so that any contract status will not leak
6121 6139           * extra zone information, svc_fmri in this case
6122 6140           */
6123 6141          if (ctp->conp_svc_ctid != ct->ct_id) {
6124 6142                  mutex_enter(&ct->ct_lock);
6125 6143                  ctp->conp_svc_zone_enter = ct->ct_id;
6126 6144                  mutex_exit(&ct->ct_lock);
6127 6145          }
6128 6146  
6129 6147          /*
6130 6148           * Reset the encapsulating process contract's zone.
6131 6149           */
6132 6150          ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6133 6151          contract_setzuniqid(ct, zone->zone_uniqid);
6134 6152  
6135 6153          /*
6136 6154           * Create a new task and associate the process with the project keyed
6137 6155           * by (projid,zoneid).
6138 6156           *
6139 6157           * We might as well be in project 0; the global zone's projid doesn't
6140 6158           * make much sense in a zone anyhow.
6141 6159           *
6142 6160           * This also increments zone_ntasks, and returns with p_lock held.
6143 6161           */
6144 6162          tk = task_create(0, zone);
6145 6163          oldtk = task_join(tk, 0);
6146 6164          mutex_exit(&cpu_lock);
6147 6165  
6148 6166          /*
6149 6167           * call RCTLOP_SET functions on this proc
6150 6168           */
6151 6169          e.rcep_p.zone = zone;
6152 6170          e.rcep_t = RCENTITY_ZONE;
6153 6171          (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6154 6172              RCD_CALLBACK);
6155 6173          mutex_exit(&pp->p_lock);
6156 6174  
6157 6175          /*
6158 6176           * We don't need to hold any of zsched's locks here; not only do we know
6159 6177           * the process and zone aren't going away, we know its session isn't
6160 6178           * changing either.
6161 6179           *
6162 6180           * By joining zsched's session here, we mimic the behavior in the
6163 6181           * global zone of init's sid being the pid of sched.  We extend this
6164 6182           * to all zlogin-like zone_enter()'ing processes as well.
6165 6183           */
6166 6184          mutex_enter(&pidlock);
6167 6185          sp = zone->zone_zsched->p_sessp;
6168 6186          sess_hold(zone->zone_zsched);
6169 6187          mutex_enter(&pp->p_lock);
6170 6188          pgexit(pp);
6171 6189          sess_rele(pp->p_sessp, B_TRUE);
6172 6190          pp->p_sessp = sp;
6173 6191          pgjoin(pp, zone->zone_zsched->p_pidp);
6174 6192  
6175 6193          /*
6176 6194           * If any threads are scheduled to be placed on zone wait queue they
6177 6195           * should abandon the idea since the wait queue is changing.
6178 6196           * We need to be holding pidlock & p_lock to do this.
6179 6197           */
6180 6198          if ((t = pp->p_tlist) != NULL) {
6181 6199                  do {
6182 6200                          thread_lock(t);
6183 6201                          /*
6184 6202                           * Kick this thread so that it doesn't sit
6185 6203                           * on a wrong wait queue.
6186 6204                           */
6187 6205                          if (ISWAITING(t))
6188 6206                                  setrun_locked(t);
6189 6207  
6190 6208                          if (t->t_schedflag & TS_ANYWAITQ)
6191 6209                                  t->t_schedflag &= ~ TS_ANYWAITQ;
6192 6210  
6193 6211                          thread_unlock(t);
6194 6212                  } while ((t = t->t_forw) != pp->p_tlist);
6195 6213          }
6196 6214  
6197 6215          /*
6198 6216           * If there is a default scheduling class for the zone and it is not
6199 6217           * the class we are currently in, change all of the threads in the
6200 6218           * process to the new class.  We need to be holding pidlock & p_lock
6201 6219           * when we call parmsset so this is a good place to do it.
6202 6220           */
6203 6221          if (zone->zone_defaultcid > 0 &&
6204 6222              zone->zone_defaultcid != curthread->t_cid) {
6205 6223                  pcparms_t pcparms;
6206 6224  
6207 6225                  pcparms.pc_cid = zone->zone_defaultcid;
6208 6226                  pcparms.pc_clparms[0] = 0;
6209 6227  
6210 6228                  /*
6211 6229                   * If setting the class fails, we still want to enter the zone.
6212 6230                   */
6213 6231                  if ((t = pp->p_tlist) != NULL) {
6214 6232                          do {
6215 6233                                  (void) parmsset(&pcparms, t);
6216 6234                          } while ((t = t->t_forw) != pp->p_tlist);
6217 6235                  }
6218 6236          }
6219 6237  
6220 6238          mutex_exit(&pp->p_lock);
6221 6239          mutex_exit(&pidlock);
6222 6240  
                   /* zonehash_lock has been held since the zone status re-check. */
6223 6241          mutex_exit(&zonehash_lock);
6224 6242          /*
6225 6243           * We're firmly in the zone; let pools progress.
6226 6244           */
6227 6245          pool_unlock();
6228 6246          task_rele(oldtk);
6229 6247          /*
6230 6248           * We don't need to retain a hold on the zone since we already
6231 6249           * incremented zone_ntasks, so the zone isn't going anywhere.
6232 6250           */
6233 6251          zone_rele(zone);
6234 6252  
6235 6253          /*
6236 6254           * Chroot
                    * Both the current directory and the root directory are pointed
                    * at the zone's root vnode.
6237 6255           */
6238 6256          vp = zone->zone_rootvp;
6239 6257          zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6240 6258          zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6241 6259  
6242 6260          /*
6243 6261           * Change process security flags.  Note that the _effective_ flags
6244 6262           * cannot change
6245 6263           */
6246 6264          secflags_copy(&pp->p_secflags.psf_lower,
6247 6265              &zone->zone_secflags.psf_lower);
6248 6266          secflags_copy(&pp->p_secflags.psf_upper,
6249 6267              &zone->zone_secflags.psf_upper);
6250 6268          secflags_copy(&pp->p_secflags.psf_inherit,
6251 6269              &zone->zone_secflags.psf_inherit);
6252 6270  
6253 6271          /*
6254 6272           * Change process credentials
                    * A fresh cred is allocated before p_crlock is taken, filled from
                    * the current one, and retargeted at the new zone.
6255 6273           */
6256 6274          newcr = cralloc();
6257 6275          mutex_enter(&pp->p_crlock);
6258 6276          cr = pp->p_cred;
6259 6277          crcopy_to(cr, newcr);
6260 6278          crsetzone(newcr, zone);
6261 6279          pp->p_cred = newcr;
6262 6280  
6263 6281          /*
6264 6282           * Restrict all process privilege sets to zone limit
6265 6283           */
6266 6284          priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6267 6285          priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6268 6286          priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6269 6287          priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6270 6288          mutex_exit(&pp->p_crlock);
6271 6289          crset(pp, newcr);
6272 6290  
6273 6291          /*
6274 6292           * Adjust upcount to reflect zone entry.
6275 6293           */
6276 6294          uid = crgetruid(newcr);
6277 6295          mutex_enter(&pidlock);
6278 6296          upcount_dec(uid, GLOBAL_ZONEID);
6279 6297          upcount_inc(uid, zoneid);
6280 6298          mutex_exit(&pidlock);
6281 6299  
6282 6300          /*
6283 6301           * Set up core file path and content.
6284 6302           */
6285 6303          set_core_defaults();
6286 6304  
6287 6305  out:
6288 6306          /*
6289 6307           * Let the other lwps continue.
                    * This is reached on success and on every failure after the
                    * holdlwps() call above; 'err' selects the return value.
6290 6308           */
6291 6309          mutex_enter(&pp->p_lock);
6292 6310          if (curthread != pp->p_agenttp)
6293 6311                  continuelwps(pp);
6294 6312          mutex_exit(&pp->p_lock);
6295 6313  
6296 6314          return (err != 0 ? set_errno(err) : 0);
6297 6315  }
6298 6316  
6299 6317  /*
6300 6318   * Systemcall entry point for zone_list(2).
6301 6319   *
6302 6320   * Processes running in a (non-global) zone only see themselves.
6303 6321   * On labeled systems, they see all zones whose label they dominate.
6304 6322   */
6305 6323  static int
6306 6324  zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6307 6325  {
6308 6326          zoneid_t *zoneids;
6309 6327          zone_t *zone, *myzone;
6310 6328          uint_t user_nzones, real_nzones;
6311 6329          uint_t domi_nzones;
6312 6330          int error;
6313 6331  
6314 6332          if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6315 6333                  return (set_errno(EFAULT));
6316 6334  
6317 6335          myzone = curproc->p_zone;
6318 6336          ASSERT(zonecount > 0);
6319 6337          if (myzone != global_zone) {
6320 6338                  bslabel_t *mybslab;
6321 6339  
6322 6340                  if (!is_system_labeled()) {
6323 6341                          /* just return current zone */
6324 6342                          real_nzones = domi_nzones = 1;
6325 6343                          zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6326 6344                          zoneids[0] = myzone->zone_id;
6327 6345                  } else {
6328 6346                          /* return all zones that are dominated */
6329 6347                          mutex_enter(&zonehash_lock);
6330 6348                          real_nzones = zonecount;
6331 6349                          domi_nzones = 0;
6332 6350                          zoneids = kmem_alloc(real_nzones *
6333 6351                              sizeof (zoneid_t), KM_SLEEP);
6334 6352                          mybslab = label2bslabel(myzone->zone_slabel);
6335 6353                          for (zone = list_head(&zone_active);
6336 6354                              zone != NULL;
6337 6355                              zone = list_next(&zone_active, zone)) {
6338 6356                                  if (zone->zone_id == GLOBAL_ZONEID)
6339 6357                                          continue;
6340 6358                                  if (zone != myzone &&
6341 6359                                      (zone->zone_flags & ZF_IS_SCRATCH))
6342 6360                                          continue;
6343 6361                                  /*
6344 6362                                   * Note that a label always dominates
6345 6363                                   * itself, so myzone is always included
6346 6364                                   * in the list.
6347 6365                                   */
6348 6366                                  if (bldominates(mybslab,
6349 6367                                      label2bslabel(zone->zone_slabel))) {
6350 6368                                          zoneids[domi_nzones++] = zone->zone_id;
6351 6369                                  }
6352 6370                          }
6353 6371                          mutex_exit(&zonehash_lock);
6354 6372                  }
6355 6373          } else {
6356 6374                  mutex_enter(&zonehash_lock);
6357 6375                  real_nzones = zonecount;
6358 6376                  domi_nzones = 0;
6359 6377                  zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), KM_SLEEP);
6360 6378                  for (zone = list_head(&zone_active); zone != NULL;
6361 6379                      zone = list_next(&zone_active, zone))
6362 6380                          zoneids[domi_nzones++] = zone->zone_id;
6363 6381  
6364 6382                  ASSERT(domi_nzones == real_nzones);
6365 6383                  mutex_exit(&zonehash_lock);
6366 6384          }
6367 6385  
6368 6386          /*
6369 6387           * If user has allocated space for fewer entries than we found, then
6370 6388           * return only up to their limit.  Either way, tell them exactly how
6371 6389           * many we found.
6372 6390           */
6373 6391          if (domi_nzones < user_nzones)
6374 6392                  user_nzones = domi_nzones;
6375 6393          error = 0;
6376 6394          if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6377 6395                  error = EFAULT;
6378 6396          } else if (zoneidlist != NULL && user_nzones != 0) {
6379 6397                  if (copyout(zoneids, zoneidlist,
6380 6398                      user_nzones * sizeof (zoneid_t)) != 0)
6381 6399                          error = EFAULT;
6382 6400          }
6383 6401  
6384 6402          kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6385 6403  
6386 6404          if (error != 0)
6387 6405                  return (set_errno(error));
6388 6406          else
6389 6407                  return (0);
6390 6408  }
6391 6409  
6392 6410  /*
6393 6411   * Systemcall entry point for zone_lookup(2).
6394 6412   *
6395 6413   * Non-global zones are only able to see themselves and (on labeled systems)
6396 6414   * the zones they dominate.
6397 6415   */
6398 6416  static zoneid_t
6399 6417  zone_lookup(const char *zone_name)
6400 6418  {
6401 6419          char *kname;
6402 6420          zone_t *zone;
6403 6421          zoneid_t zoneid;
6404 6422          int err;
6405 6423  
6406 6424          if (zone_name == NULL) {
6407 6425                  /* return caller's zone id */
6408 6426                  return (getzoneid());
6409 6427          }
6410 6428  
6411 6429          kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6412 6430          if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6413 6431                  kmem_free(kname, ZONENAME_MAX);
6414 6432                  return (set_errno(err));
6415 6433          }
6416 6434  
6417 6435          mutex_enter(&zonehash_lock);
6418 6436          zone = zone_find_all_by_name(kname);
6419 6437          kmem_free(kname, ZONENAME_MAX);
6420 6438          /*
6421 6439           * In a non-global zone, can only lookup global and own name.
6422 6440           * In Trusted Extensions zone label dominance rules apply.
6423 6441           */
6424 6442          if (zone == NULL ||
6425 6443              zone_status_get(zone) < ZONE_IS_READY ||
6426 6444              !zone_list_access(zone)) {
6427 6445                  mutex_exit(&zonehash_lock);
6428 6446                  return (set_errno(EINVAL));
6429 6447          } else {
6430 6448                  zoneid = zone->zone_id;
6431 6449                  mutex_exit(&zonehash_lock);
6432 6450                  return (zoneid);
6433 6451          }
6434 6452  }
6435 6453  
6436 6454  static int
6437 6455  zone_version(int *version_arg)
6438 6456  {
6439 6457          int version = ZONE_SYSCALL_API_VERSION;
6440 6458  
6441 6459          if (copyout(&version, version_arg, sizeof (int)) != 0)
6442 6460                  return (set_errno(EFAULT));
6443 6461          return (0);
6444 6462  }
6445 6463  
6446 6464  /* ARGSUSED */
6447 6465  long
6448 6466  zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6449 6467  {
6450 6468          zone_def zs;
6451 6469          int err;
6452 6470  
6453 6471          switch (cmd) {
6454 6472          case ZONE_CREATE:
6455 6473                  if (get_udatamodel() == DATAMODEL_NATIVE) {
6456 6474                          if (copyin(arg1, &zs, sizeof (zone_def))) {
6457 6475                                  return (set_errno(EFAULT));
6458 6476                          }
6459 6477                  } else {
6460 6478  #ifdef _SYSCALL32_IMPL
6461 6479                          zone_def32 zs32;
6462 6480  
6463 6481                          if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6464 6482                                  return (set_errno(EFAULT));
6465 6483                          }
6466 6484                          zs.zone_name =
6467 6485                              (const char *)(unsigned long)zs32.zone_name;
6468 6486                          zs.zone_root =
6469 6487                              (const char *)(unsigned long)zs32.zone_root;
6470 6488                          zs.zone_privs =
6471 6489                              (const struct priv_set *)
6472 6490                              (unsigned long)zs32.zone_privs;
6473 6491                          zs.zone_privssz = zs32.zone_privssz;

↓ open down ↓

784 lines elided

↑ open up ↑

6474 6492                          zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6475 6493                          zs.rctlbufsz = zs32.rctlbufsz;
6476 6494                          zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6477 6495                          zs.zfsbufsz = zs32.zfsbufsz;
6478 6496                          zs.extended_error =
6479 6497                              (int *)(unsigned long)zs32.extended_error;
6480 6498                          zs.match = zs32.match;
6481 6499                          zs.doi = zs32.doi;
6482 6500                          zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6483 6501                          zs.flags = zs32.flags;
     6502 +                        zs.zone_did = zs32.zone_did;
6484 6503  #else
6485 6504                          panic("get_udatamodel() returned bogus result\n");
6486 6505  #endif
6487 6506                  }
6488 6507  
6489 6508                  return (zone_create(zs.zone_name, zs.zone_root,
6490 6509                      zs.zone_privs, zs.zone_privssz,
6491 6510                      (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6492 6511                      (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6493 6512                      zs.extended_error, zs.match, zs.doi,
6494      -                    zs.label, zs.flags));
     6513 +                    zs.label, zs.flags, zs.zone_did));
6495 6514          case ZONE_BOOT:
6496 6515                  return (zone_boot((zoneid_t)(uintptr_t)arg1));
6497 6516          case ZONE_DESTROY:
6498 6517                  return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6499 6518          case ZONE_GETATTR:
6500 6519                  return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6501 6520                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6502 6521          case ZONE_SETATTR:
6503 6522                  return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6504 6523                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));

6505 6524          case ZONE_ENTER:
6506 6525                  return (zone_enter((zoneid_t)(uintptr_t)arg1));
6507 6526          case ZONE_LIST:
6508 6527                  return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6509 6528          case ZONE_SHUTDOWN:
6510 6529                  return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6511 6530          case ZONE_LOOKUP:
6512 6531                  return (zone_lookup((const char *)arg1));
6513 6532          case ZONE_VERSION:
6514 6533                  return (zone_version((int *)arg1));
6515 6534          case ZONE_ADD_DATALINK:
6516 6535                  return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6517 6536                      (datalink_id_t)(uintptr_t)arg2));
6518 6537          case ZONE_DEL_DATALINK:
6519 6538                  return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6520 6539                      (datalink_id_t)(uintptr_t)arg2));
6521 6540          case ZONE_CHECK_DATALINK: {
6522 6541                  zoneid_t        zoneid;
6523 6542                  boolean_t       need_copyout;
6524 6543  
6525 6544                  if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6526 6545                          return (EFAULT);
6527 6546                  need_copyout = (zoneid == ALL_ZONES);
6528 6547                  err = zone_check_datalink(&zoneid,
6529 6548                      (datalink_id_t)(uintptr_t)arg2);
6530 6549                  if (err == 0 && need_copyout) {
6531 6550                          if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6532 6551                                  err = EFAULT;
6533 6552                  }
6534 6553                  return (err == 0 ? 0 : set_errno(err));
6535 6554          }
6536 6555          case ZONE_LIST_DATALINK:
6537 6556                  return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6538 6557                      (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6539 6558          default:
6540 6559                  return (set_errno(EINVAL));
6541 6560          }
6542 6561  }
6543 6562  
/*
 * Work order handed to the zone_ki_call_zoneadmd() thread.  zone_kadmin()
 * allocates one, fills it in and passes it to thread_create(); the thread
 * copies both members out and then frees the structure.
 */
6544 6563  struct zarg {
6545 6564          zone_t *zone;           /* zone to act on; held by zone_kadmin() */
6546 6565          zone_cmd_arg_t arg;     /* request passed in the door upcall */
6547 6566  };
6548 6567  
6549 6568  static int
6550 6569  zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6551 6570  {
6552 6571          char *buf;
6553 6572          size_t buflen;
6554 6573          int error;
6555 6574  
6556 6575          buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6557 6576          buf = kmem_alloc(buflen, KM_SLEEP);
6558 6577          (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6559 6578          error = door_ki_open(buf, doorp);
6560 6579          kmem_free(buf, buflen);
6561 6580          return (error);
6562 6581  }
6563 6582  
6564 6583  static void
6565 6584  zone_release_door(door_handle_t *doorp)
6566 6585  {
6567 6586          door_ki_rele(*doorp);
6568 6587          *doorp = NULL;
6569 6588  }
6570 6589  
6571 6590  static void
6572 6591  zone_ki_call_zoneadmd(struct zarg *zargp)
6573 6592  {
6574 6593          door_handle_t door = NULL;
6575 6594          door_arg_t darg, save_arg;
6576 6595          char *zone_name;
6577 6596          size_t zone_namelen;
6578 6597          zoneid_t zoneid;
6579 6598          zone_t *zone;
6580 6599          zone_cmd_arg_t arg;
6581 6600          uint64_t uniqid;
6582 6601          size_t size;
6583 6602          int error;
6584 6603          int retry;
6585 6604  
6586 6605          zone = zargp->zone;
6587 6606          arg = zargp->arg;
6588 6607          kmem_free(zargp, sizeof (*zargp));
6589 6608  
6590 6609          zone_namelen = strlen(zone->zone_name) + 1;
6591 6610          zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6592 6611          bcopy(zone->zone_name, zone_name, zone_namelen);
6593 6612          zoneid = zone->zone_id;
6594 6613          uniqid = zone->zone_uniqid;
6595 6614          /*
6596 6615           * zoneadmd may be down, but at least we can empty out the zone.
6597 6616           * We can ignore the return value of zone_empty() since we're called
6598 6617           * from a kernel thread and know we won't be delivered any signals.
6599 6618           */
6600 6619          ASSERT(curproc == &p0);
6601 6620          (void) zone_empty(zone);
6602 6621          ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6603 6622          zone_rele(zone);
6604 6623  
6605 6624          size = sizeof (arg);
6606 6625          darg.rbuf = (char *)&arg;
6607 6626          darg.data_ptr = (char *)&arg;
6608 6627          darg.rsize = size;
6609 6628          darg.data_size = size;
6610 6629          darg.desc_ptr = NULL;
6611 6630          darg.desc_num = 0;
6612 6631  
6613 6632          save_arg = darg;
6614 6633          /*
6615 6634           * Since we're not holding a reference to the zone, any number of
6616 6635           * things can go wrong, including the zone disappearing before we get a
6617 6636           * chance to talk to zoneadmd.
6618 6637           */
6619 6638          for (retry = 0; /* forever */; retry++) {
6620 6639                  if (door == NULL &&
6621 6640                      (error = zone_lookup_door(zone_name, &door)) != 0) {
6622 6641                          goto next;
6623 6642                  }
6624 6643                  ASSERT(door != NULL);
6625 6644  
6626 6645                  if ((error = door_ki_upcall_limited(door, &darg, NULL,
6627 6646                      SIZE_MAX, 0)) == 0) {
6628 6647                          break;
6629 6648                  }
6630 6649                  switch (error) {
6631 6650                  case EINTR:
6632 6651                          /* FALLTHROUGH */
6633 6652                  case EAGAIN:    /* process may be forking */
6634 6653                          /*
6635 6654                           * Back off for a bit
6636 6655                           */
6637 6656                          break;
6638 6657                  case EBADF:
6639 6658                          zone_release_door(&door);
6640 6659                          if (zone_lookup_door(zone_name, &door) != 0) {
6641 6660                                  /*
6642 6661                                   * zoneadmd may be dead, but it may come back to
6643 6662                                   * life later.
6644 6663                                   */
6645 6664                                  break;
6646 6665                          }
6647 6666                          break;
6648 6667                  default:
6649 6668                          cmn_err(CE_WARN,
6650 6669                              "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6651 6670                              error);
6652 6671                          goto out;
6653 6672                  }
6654 6673  next:
6655 6674                  /*
6656 6675                   * If this isn't the same zone_t that we originally had in mind,
6657 6676                   * then this is the same as if two kadmin requests come in at
6658 6677                   * the same time: the first one wins.  This means we lose, so we
6659 6678                   * bail.
6660 6679                   */
6661 6680                  if ((zone = zone_find_by_id(zoneid)) == NULL) {
6662 6681                          /*
6663 6682                           * Problem is solved.
6664 6683                           */
6665 6684                          break;
6666 6685                  }
6667 6686                  if (zone->zone_uniqid != uniqid) {
6668 6687                          /*
6669 6688                           * zoneid recycled
6670 6689                           */
6671 6690                          zone_rele(zone);
6672 6691                          break;
6673 6692                  }
6674 6693                  /*
6675 6694                   * We could zone_status_timedwait(), but there doesn't seem to
6676 6695                   * be much point in doing that (plus, it would mean that
6677 6696                   * zone_free() isn't called until this thread exits).
6678 6697                   */
6679 6698                  zone_rele(zone);
6680 6699                  delay(hz);
6681 6700                  darg = save_arg;
6682 6701          }
6683 6702  out:
6684 6703          if (door != NULL) {
6685 6704                  zone_release_door(&door);
6686 6705          }
6687 6706          kmem_free(zone_name, zone_namelen);
6688 6707          thread_exit();
6689 6708  }
6690 6709  
6691 6710  /*
6692 6711   * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6693 6712   * kadmin().  The caller is a process in the zone.
6694 6713   *
6695 6714   * In order to shutdown the zone, we will hand off control to zoneadmd
6696 6715   * (running in the global zone) via a door.  We do a half-hearted job at
6697 6716   * killing all processes in the zone, create a kernel thread to contact
6698 6717   * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6699 6718   * a form of generation number used to let zoneadmd (as well as
6700 6719   * zone_destroy()) know exactly which zone they're re talking about.
6701 6720   */
6702 6721  int
6703 6722  zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6704 6723  {
6705 6724          struct zarg *zargp;
6706 6725          zone_cmd_t zcmd;
6707 6726          zone_t *zone;
6708 6727  
6709 6728          zone = curproc->p_zone;
6710 6729          ASSERT(getzoneid() != GLOBAL_ZONEID);
6711 6730  
6712 6731          switch (cmd) {
6713 6732          case A_SHUTDOWN:
6714 6733                  switch (fcn) {
6715 6734                  case AD_HALT:
6716 6735                  case AD_POWEROFF:
6717 6736                          zcmd = Z_HALT;
6718 6737                          break;
6719 6738                  case AD_BOOT:
6720 6739                          zcmd = Z_REBOOT;
6721 6740                          break;
6722 6741                  case AD_IBOOT:
6723 6742                  case AD_SBOOT:
6724 6743                  case AD_SIBOOT:
6725 6744                  case AD_NOSYNC:
6726 6745                          return (ENOTSUP);
6727 6746                  default:
6728 6747                          return (EINVAL);
6729 6748                  }
6730 6749                  break;
6731 6750          case A_REBOOT:
6732 6751                  zcmd = Z_REBOOT;
6733 6752                  break;
6734 6753          case A_FTRACE:
6735 6754          case A_REMOUNT:
6736 6755          case A_FREEZE:
6737 6756          case A_DUMP:
6738 6757          case A_CONFIG:
6739 6758                  return (ENOTSUP);
6740 6759          default:
6741 6760                  ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6742 6761                  return (EINVAL);
6743 6762          }
6744 6763  
6745 6764          if (secpolicy_zone_admin(credp, B_FALSE))
6746 6765                  return (EPERM);
6747 6766          mutex_enter(&zone_status_lock);
6748 6767  
6749 6768          /*
6750 6769           * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6751 6770           * is in the zone.
6752 6771           */
6753 6772          ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6754 6773          if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6755 6774                  /*
6756 6775                   * This zone is already on its way down.
6757 6776                   */
6758 6777                  mutex_exit(&zone_status_lock);
6759 6778                  return (0);
6760 6779          }
6761 6780          /*
6762 6781           * Prevent future zone_enter()s
6763 6782           */
6764 6783          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6765 6784          mutex_exit(&zone_status_lock);
6766 6785  
6767 6786          /*
6768 6787           * Kill everyone now and call zoneadmd later.
6769 6788           * zone_ki_call_zoneadmd() will do a more thorough job of this
6770 6789           * later.
6771 6790           */
6772 6791          killall(zone->zone_id);
6773 6792          /*
6774 6793           * Now, create the thread to contact zoneadmd and do the rest of the
6775 6794           * work.  This thread can't be created in our zone otherwise
6776 6795           * zone_destroy() would deadlock.
6777 6796           */
6778 6797          zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6779 6798          zargp->arg.cmd = zcmd;
6780 6799          zargp->arg.uniqid = zone->zone_uniqid;
6781 6800          zargp->zone = zone;
6782 6801          (void) strcpy(zargp->arg.locale, "C");
6783 6802          /* mdep was already copied in for us by uadmin */
6784 6803          if (mdep != NULL)
6785 6804                  (void) strlcpy(zargp->arg.bootbuf, mdep,
6786 6805                      sizeof (zargp->arg.bootbuf));
6787 6806          zone_hold(zone);
6788 6807  
6789 6808          (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6790 6809              TS_RUN, minclsyspri);
6791 6810          exit(CLD_EXITED, 0);
6792 6811  
6793 6812          return (EINVAL);
6794 6813  }
6795 6814  
6796 6815  /*
6797 6816   * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6798 6817   * status to ZONE_IS_SHUTTING_DOWN.
6799 6818   *
6800 6819   * This function also shuts down all running zones to ensure that they won't
6801 6820   * fork new processes.
6802 6821   */
6803 6822  void
6804 6823  zone_shutdown_global(void)
6805 6824  {
6806 6825          zone_t *current_zonep;
6807 6826  
6808 6827          ASSERT(INGLOBALZONE(curproc));
6809 6828          mutex_enter(&zonehash_lock);
6810 6829          mutex_enter(&zone_status_lock);
6811 6830  
6812 6831          /* Modify the global zone's status first. */
6813 6832          ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6814 6833          zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6815 6834  
6816 6835          /*
6817 6836           * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6818 6837           * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6819 6838           * could cause assertions to fail (e.g., assertions about a zone's
6820 6839           * state during initialization, readying, or booting) or produce races.
6821 6840           * We'll let threads continue to initialize and ready new zones: they'll
6822 6841           * fail to boot the new zones when they see that the global zone is
6823 6842           * shutting down.
6824 6843           */
6825 6844          for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6826 6845              current_zonep = list_next(&zone_active, current_zonep)) {
6827 6846                  if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6828 6847                          zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6829 6848          }
6830 6849          mutex_exit(&zone_status_lock);
6831 6850          mutex_exit(&zonehash_lock);
6832 6851  }
6833 6852  
6834 6853  /*
6835 6854   * Returns true if the named dataset is visible in the current zone.
6836 6855   * The 'write' parameter is set to 1 if the dataset is also writable.
6837 6856   */
6838 6857  int
6839 6858  zone_dataset_visible(const char *dataset, int *write)
6840 6859  {
6841 6860          static int zfstype = -1;
6842 6861          zone_dataset_t *zd;
6843 6862          size_t len;
6844 6863          zone_t *zone = curproc->p_zone;
6845 6864          const char *name = NULL;
6846 6865          vfs_t *vfsp = NULL;
6847 6866  
6848 6867          if (dataset[0] == '\0')
6849 6868                  return (0);
6850 6869  
6851 6870          /*
6852 6871           * Walk the list once, looking for datasets which match exactly, or
6853 6872           * specify a dataset underneath an exported dataset.  If found, return
6854 6873           * true and note that it is writable.
6855 6874           */
6856 6875          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6857 6876              zd = list_next(&zone->zone_datasets, zd)) {
6858 6877  
6859 6878                  len = strlen(zd->zd_dataset);
6860 6879                  if (strlen(dataset) >= len &&
6861 6880                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6862 6881                      (dataset[len] == '\0' || dataset[len] == '/' ||
6863 6882                      dataset[len] == '@')) {
6864 6883                          if (write)
6865 6884                                  *write = 1;
6866 6885                          return (1);
6867 6886                  }
6868 6887          }
6869 6888  
6870 6889          /*
6871 6890           * Walk the list a second time, searching for datasets which are parents
6872 6891           * of exported datasets.  These should be visible, but read-only.
6873 6892           *
6874 6893           * Note that we also have to support forms such as 'pool/dataset/', with
6875 6894           * a trailing slash.
6876 6895           */
6877 6896          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6878 6897              zd = list_next(&zone->zone_datasets, zd)) {
6879 6898  
6880 6899                  len = strlen(dataset);
6881 6900                  if (dataset[len - 1] == '/')
6882 6901                          len--;  /* Ignore trailing slash */
6883 6902                  if (len < strlen(zd->zd_dataset) &&
6884 6903                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6885 6904                      zd->zd_dataset[len] == '/') {
6886 6905                          if (write)
6887 6906                                  *write = 0;
6888 6907                          return (1);
6889 6908                  }
6890 6909          }
6891 6910  
6892 6911          /*
6893 6912           * We reach here if the given dataset is not found in the zone_dataset
6894 6913           * list. Check if this dataset was added as a filesystem (ie. "add fs")
6895 6914           * instead of delegation. For this we search for the dataset in the
6896 6915           * zone_vfslist of this zone. If found, return true and note that it is
6897 6916           * not writable.
6898 6917           */
6899 6918  
6900 6919          /*
6901 6920           * Initialize zfstype if it is not initialized yet.
6902 6921           */
6903 6922          if (zfstype == -1) {
6904 6923                  struct vfssw *vswp = vfs_getvfssw("zfs");
6905 6924                  zfstype = vswp - vfssw;
6906 6925                  vfs_unrefvfssw(vswp);
6907 6926          }
6908 6927  
6909 6928          vfs_list_read_lock();
6910 6929          vfsp = zone->zone_vfslist;
6911 6930          do {
6912 6931                  ASSERT(vfsp);
6913 6932                  if (vfsp->vfs_fstype == zfstype) {
6914 6933                          name = refstr_value(vfsp->vfs_resource);
6915 6934  
6916 6935                          /*
6917 6936                           * Check if we have an exact match.
6918 6937                           */
6919 6938                          if (strcmp(dataset, name) == 0) {
6920 6939                                  vfs_list_unlock();
6921 6940                                  if (write)
6922 6941                                          *write = 0;
6923 6942                                  return (1);
6924 6943                          }
6925 6944                          /*
6926 6945                           * We need to check if we are looking for parents of
6927 6946                           * a dataset. These should be visible, but read-only.
6928 6947                           */
6929 6948                          len = strlen(dataset);
6930 6949                          if (dataset[len - 1] == '/')
6931 6950                                  len--;
6932 6951  
6933 6952                          if (len < strlen(name) &&
6934 6953                              bcmp(dataset, name, len) == 0 && name[len] == '/') {
6935 6954                                  vfs_list_unlock();
6936 6955                                  if (write)
6937 6956                                          *write = 0;
6938 6957                                  return (1);
6939 6958                          }
6940 6959                  }
6941 6960                  vfsp = vfsp->vfs_zone_next;
6942 6961          } while (vfsp != zone->zone_vfslist);
6943 6962  
6944 6963          vfs_list_unlock();
6945 6964          return (0);
6946 6965  }
6947 6966  
6948 6967  /*
6949 6968   * zone_find_by_any_path() -
6950 6969   *
6951 6970   * kernel-private routine similar to zone_find_by_path(), but which
6952 6971   * effectively compares against zone paths rather than zonerootpath
6953 6972   * (i.e., the last component of zonerootpaths, which should be "root/",
6954 6973   * are not compared.)  This is done in order to accurately identify all
6955 6974   * paths, whether zone-visible or not, including those which are parallel
6956 6975   * to /root/, such as /dev/, /home/, etc...
6957 6976   *
6958 6977   * If the specified path does not fall under any zone path then global
6959 6978   * zone is returned.
6960 6979   *
6961 6980   * The treat_abs parameter indicates whether the path should be treated as
6962 6981   * an absolute path although it does not begin with "/".  (This supports
6963 6982   * nfs mount syntax such as host:any/path.)
6964 6983   *
6965 6984   * The caller is responsible for zone_rele of the returned zone.
6966 6985   */
6967 6986  zone_t *
6968 6987  zone_find_by_any_path(const char *path, boolean_t treat_abs)
6969 6988  {
6970 6989          zone_t *zone;
6971 6990          int path_offset = 0;
6972 6991  
6973 6992          if (path == NULL) {
6974 6993                  zone_hold(global_zone);
6975 6994                  return (global_zone);
6976 6995          }
6977 6996  
6978 6997          if (*path != '/') {
6979 6998                  ASSERT(treat_abs);
6980 6999                  path_offset = 1;
6981 7000          }
6982 7001  
6983 7002          mutex_enter(&zonehash_lock);
6984 7003          for (zone = list_head(&zone_active); zone != NULL;
6985 7004              zone = list_next(&zone_active, zone)) {
6986 7005                  char    *c;
6987 7006                  size_t  pathlen;
6988 7007                  char *rootpath_start;
6989 7008  
6990 7009                  if (zone == global_zone)        /* skip global zone */
6991 7010                          continue;
6992 7011  
6993 7012                  /* scan backwards to find start of last component */
6994 7013                  c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6995 7014                  do {
6996 7015                          c--;
6997 7016                  } while (*c != '/');
6998 7017  
6999 7018                  pathlen = c - zone->zone_rootpath + 1 - path_offset;
7000 7019                  rootpath_start = (zone->zone_rootpath + path_offset);
7001 7020                  if (strncmp(path, rootpath_start, pathlen) == 0)
7002 7021                          break;
7003 7022          }
7004 7023          if (zone == NULL)
7005 7024                  zone = global_zone;
7006 7025          zone_hold(zone);
7007 7026          mutex_exit(&zonehash_lock);
7008 7027          return (zone);
7009 7028  }
7010 7029  
7011 7030  /*
7012 7031   * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7013 7032   * zone_dl_t pointer if found, and NULL otherwise.
7014 7033   */
7015 7034  static zone_dl_t *
7016 7035  zone_find_dl(zone_t *zone, datalink_id_t linkid)
7017 7036  {
7018 7037          zone_dl_t *zdl;
7019 7038  
7020 7039          ASSERT(mutex_owned(&zone->zone_lock));
7021 7040          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7022 7041              zdl = list_next(&zone->zone_dl_list, zdl)) {
7023 7042                  if (zdl->zdl_id == linkid)
7024 7043                          break;
7025 7044          }
7026 7045          return (zdl);
7027 7046  }
7028 7047  
7029 7048  static boolean_t
7030 7049  zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7031 7050  {
7032 7051          boolean_t exists;
7033 7052  
7034 7053          mutex_enter(&zone->zone_lock);
7035 7054          exists = (zone_find_dl(zone, linkid) != NULL);
7036 7055          mutex_exit(&zone->zone_lock);
7037 7056          return (exists);
7038 7057  }
7039 7058  
7040 7059  /*
7041 7060   * Add an data link name for the zone.
7042 7061   */
7043 7062  static int
7044 7063  zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7045 7064  {
7046 7065          zone_dl_t *zdl;
7047 7066          zone_t *zone;
7048 7067          zone_t *thiszone;
7049 7068  
7050 7069          if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7051 7070                  return (set_errno(ENXIO));
7052 7071  
7053 7072          /* Verify that the datalink ID doesn't already belong to a zone. */
7054 7073          mutex_enter(&zonehash_lock);
7055 7074          for (zone = list_head(&zone_active); zone != NULL;
7056 7075              zone = list_next(&zone_active, zone)) {
7057 7076                  if (zone_dl_exists(zone, linkid)) {
7058 7077                          mutex_exit(&zonehash_lock);
7059 7078                          zone_rele(thiszone);
7060 7079                          return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7061 7080                  }
7062 7081          }
7063 7082  
7064 7083          zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7065 7084          zdl->zdl_id = linkid;
7066 7085          zdl->zdl_net = NULL;
7067 7086          mutex_enter(&thiszone->zone_lock);
7068 7087          list_insert_head(&thiszone->zone_dl_list, zdl);
7069 7088          mutex_exit(&thiszone->zone_lock);
7070 7089          mutex_exit(&zonehash_lock);
7071 7090          zone_rele(thiszone);
7072 7091          return (0);
7073 7092  }
7074 7093  
7075 7094  static int
7076 7095  zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7077 7096  {
7078 7097          zone_dl_t *zdl;
7079 7098          zone_t *zone;
7080 7099          int err = 0;
7081 7100  
7082 7101          if ((zone = zone_find_by_id(zoneid)) == NULL)
7083 7102                  return (set_errno(EINVAL));
7084 7103  
7085 7104          mutex_enter(&zone->zone_lock);
7086 7105          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7087 7106                  err = ENXIO;
7088 7107          } else {
7089 7108                  list_remove(&zone->zone_dl_list, zdl);
7090 7109                  nvlist_free(zdl->zdl_net);
7091 7110                  kmem_free(zdl, sizeof (zone_dl_t));
7092 7111          }
7093 7112          mutex_exit(&zone->zone_lock);
7094 7113          zone_rele(zone);
7095 7114          return (err == 0 ? 0 : set_errno(err));
7096 7115  }
7097 7116  
7098 7117  /*
7099 7118   * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
7100 7119   * the linkid.  Otherwise we just check if the specified zoneidp has been
7101 7120   * assigned the supplied linkid.
7102 7121   */
7103 7122  int
7104 7123  zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7105 7124  {
7106 7125          zone_t *zone;
7107 7126          int err = ENXIO;
7108 7127  
7109 7128          if (*zoneidp != ALL_ZONES) {
7110 7129                  if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7111 7130                          if (zone_dl_exists(zone, linkid))
7112 7131                                  err = 0;
7113 7132                          zone_rele(zone);
7114 7133                  }
7115 7134                  return (err);
7116 7135          }
7117 7136  
7118 7137          mutex_enter(&zonehash_lock);
7119 7138          for (zone = list_head(&zone_active); zone != NULL;
7120 7139              zone = list_next(&zone_active, zone)) {
7121 7140                  if (zone_dl_exists(zone, linkid)) {
7122 7141                          *zoneidp = zone->zone_id;
7123 7142                          err = 0;
7124 7143                          break;
7125 7144                  }
7126 7145          }
7127 7146          mutex_exit(&zonehash_lock);
7128 7147          return (err);
7129 7148  }
7130 7149  
7131 7150  /*
7132 7151   * Get the list of datalink IDs assigned to a zone.
7133 7152   *
7134 7153   * On input, *nump is the number of datalink IDs that can fit in the supplied
7135 7154   * idarray.  Upon return, *nump is either set to the number of datalink IDs
7136 7155   * that were placed in the array if the array was large enough, or to the
7137 7156   * number of datalink IDs that the function needs to place in the array if the
7138 7157   * array is too small.
7139 7158   */
7140 7159  static int
7141 7160  zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7142 7161  {
7143 7162          uint_t num, dlcount;
7144 7163          zone_t *zone;
7145 7164          zone_dl_t *zdl;
7146 7165          datalink_id_t *idptr = idarray;
7147 7166  
7148 7167          if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7149 7168                  return (set_errno(EFAULT));
7150 7169          if ((zone = zone_find_by_id(zoneid)) == NULL)
7151 7170                  return (set_errno(ENXIO));
7152 7171  
7153 7172          num = 0;
7154 7173          mutex_enter(&zone->zone_lock);
7155 7174          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7156 7175              zdl = list_next(&zone->zone_dl_list, zdl)) {
7157 7176                  /*
7158 7177                   * If the list is bigger than what the caller supplied, just
7159 7178                   * count, don't do copyout.
7160 7179                   */
7161 7180                  if (++num > dlcount)
7162 7181                          continue;
7163 7182                  if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7164 7183                          mutex_exit(&zone->zone_lock);
7165 7184                          zone_rele(zone);
7166 7185                          return (set_errno(EFAULT));
7167 7186                  }
7168 7187                  idptr++;
7169 7188          }
7170 7189          mutex_exit(&zone->zone_lock);
7171 7190          zone_rele(zone);
7172 7191  
7173 7192          /* Increased or decreased, caller should be notified. */
7174 7193          if (num != dlcount) {
7175 7194                  if (copyout(&num, nump, sizeof (num)) != 0)
7176 7195                          return (set_errno(EFAULT));
7177 7196          }
7178 7197          return (0);
7179 7198  }
7180 7199  
7181 7200  /*
7182 7201   * Public interface for looking up a zone by zoneid. It's a customized version
7183 7202   * for netstack_zone_create(). It can only be called from the zsd create
7184 7203   * callbacks, since it doesn't have reference on the zone structure hence if
7185 7204   * it is called elsewhere the zone could disappear after the zonehash_lock
7186 7205   * is dropped.
7187 7206   *
7188 7207   * Furthermore it
7189 7208   * 1. Doesn't check the status of the zone.
7190 7209   * 2. It will be called even before zone_init is called, in that case the
7191 7210   *    address of zone0 is returned directly, and netstack_zone_create()
7192 7211   *    will only assign a value to zone0.zone_netstack, won't break anything.
7193 7212   * 3. Returns without the zone being held.
7194 7213   */
7195 7214  zone_t *
7196 7215  zone_find_by_id_nolock(zoneid_t zoneid)
7197 7216  {
7198 7217          zone_t *zone;
7199 7218  
7200 7219          mutex_enter(&zonehash_lock);
7201 7220          if (zonehashbyid == NULL)
7202 7221                  zone = &zone0;
7203 7222          else
7204 7223                  zone = zone_find_all_by_id(zoneid);
7205 7224          mutex_exit(&zonehash_lock);
7206 7225          return (zone);
7207 7226  }
7208 7227  
7209 7228  /*
7210 7229   * Walk the datalinks for a given zone
7211 7230   */
7212 7231  int
7213 7232  zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7214 7233      void *data)
7215 7234  {
7216 7235          zone_t          *zone;
7217 7236          zone_dl_t       *zdl;
7218 7237          datalink_id_t   *idarray;
7219 7238          uint_t          idcount = 0;
7220 7239          int             i, ret = 0;
7221 7240  
7222 7241          if ((zone = zone_find_by_id(zoneid)) == NULL)
7223 7242                  return (ENOENT);
7224 7243  
7225 7244          /*
7226 7245           * We first build an array of linkid's so that we can walk these and
7227 7246           * execute the callback with the zone_lock dropped.
7228 7247           */
7229 7248          mutex_enter(&zone->zone_lock);
7230 7249          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7231 7250              zdl = list_next(&zone->zone_dl_list, zdl)) {
7232 7251                  idcount++;
7233 7252          }
7234 7253  
7235 7254          if (idcount == 0) {
7236 7255                  mutex_exit(&zone->zone_lock);
7237 7256                  zone_rele(zone);
7238 7257                  return (0);
7239 7258          }
7240 7259  
7241 7260          idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7242 7261          if (idarray == NULL) {
7243 7262                  mutex_exit(&zone->zone_lock);
7244 7263                  zone_rele(zone);
7245 7264                  return (ENOMEM);
7246 7265          }
7247 7266  
7248 7267          for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7249 7268              i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7250 7269                  idarray[i] = zdl->zdl_id;
7251 7270          }
7252 7271  
7253 7272          mutex_exit(&zone->zone_lock);
7254 7273  
7255 7274          for (i = 0; i < idcount && ret == 0; i++) {
7256 7275                  if ((ret = (*cb)(idarray[i], data)) != 0)
7257 7276                          break;
7258 7277          }
7259 7278  
7260 7279          zone_rele(zone);
7261 7280          kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7262 7281          return (ret);
7263 7282  }
7264 7283  
7265 7284  static char *
7266 7285  zone_net_type2name(int type)
7267 7286  {
7268 7287          switch (type) {
7269 7288          case ZONE_NETWORK_ADDRESS:
7270 7289                  return (ZONE_NET_ADDRNAME);
7271 7290          case ZONE_NETWORK_DEFROUTER:
7272 7291                  return (ZONE_NET_RTRNAME);
7273 7292          default:
7274 7293                  return (NULL);
7275 7294          }
7276 7295  }
7277 7296  
7278 7297  static int
7279 7298  zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7280 7299  {
7281 7300          zone_t *zone;
7282 7301          zone_dl_t *zdl;
7283 7302          nvlist_t *nvl;
7284 7303          int err = 0;
7285 7304          uint8_t *new = NULL;
7286 7305          char *nvname;
7287 7306          int bufsize;
7288 7307          datalink_id_t linkid = znbuf->zn_linkid;
7289 7308  
7290 7309          if (secpolicy_zone_config(CRED()) != 0)
7291 7310                  return (set_errno(EPERM));
7292 7311  
7293 7312          if (zoneid == GLOBAL_ZONEID)
7294 7313                  return (set_errno(EINVAL));
7295 7314  
7296 7315          nvname = zone_net_type2name(znbuf->zn_type);
7297 7316          bufsize = znbuf->zn_len;
7298 7317          new = znbuf->zn_val;
7299 7318          if (nvname == NULL)
7300 7319                  return (set_errno(EINVAL));
7301 7320  
7302 7321          if ((zone = zone_find_by_id(zoneid)) == NULL) {
7303 7322                  return (set_errno(EINVAL));
7304 7323          }
7305 7324  
7306 7325          mutex_enter(&zone->zone_lock);
7307 7326          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7308 7327                  err = ENXIO;
7309 7328                  goto done;
7310 7329          }
7311 7330          if ((nvl = zdl->zdl_net) == NULL) {
7312 7331                  if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7313 7332                          err = ENOMEM;
7314 7333                          goto done;
7315 7334                  } else {
7316 7335                          zdl->zdl_net = nvl;
7317 7336                  }
7318 7337          }
7319 7338          if (nvlist_exists(nvl, nvname)) {
7320 7339                  err = EINVAL;
7321 7340                  goto done;
7322 7341          }
7323 7342          err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7324 7343          ASSERT(err == 0);
7325 7344  done:
7326 7345          mutex_exit(&zone->zone_lock);
7327 7346          zone_rele(zone);
7328 7347          if (err != 0)
7329 7348                  return (set_errno(err));
7330 7349          else
7331 7350                  return (0);
7332 7351  }
7333 7352  
7334 7353  static int
7335 7354  zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7336 7355  {
7337 7356          zone_t *zone;
7338 7357          zone_dl_t *zdl;
7339 7358          nvlist_t *nvl;
7340 7359          uint8_t *ptr;
7341 7360          uint_t psize;
7342 7361          int err = 0;
7343 7362          char *nvname;
7344 7363          int bufsize;
7345 7364          void *buf;
7346 7365          datalink_id_t linkid = znbuf->zn_linkid;
7347 7366  
7348 7367          if (zoneid == GLOBAL_ZONEID)
7349 7368                  return (set_errno(EINVAL));
7350 7369  
7351 7370          nvname = zone_net_type2name(znbuf->zn_type);
7352 7371          bufsize = znbuf->zn_len;
7353 7372          buf = znbuf->zn_val;
7354 7373  
7355 7374          if (nvname == NULL)
7356 7375                  return (set_errno(EINVAL));
7357 7376          if ((zone = zone_find_by_id(zoneid)) == NULL)
7358 7377                  return (set_errno(EINVAL));
7359 7378  
7360 7379          mutex_enter(&zone->zone_lock);
7361 7380          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7362 7381                  err = ENXIO;
7363 7382                  goto done;
7364 7383          }
7365 7384          if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7366 7385                  err = ENOENT;
7367 7386                  goto done;
7368 7387          }
7369 7388          err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7370 7389          ASSERT(err == 0);
7371 7390  
7372 7391          if (psize > bufsize) {
7373 7392                  err = ENOBUFS;
7374 7393                  goto done;
7375 7394          }
7376 7395          znbuf->zn_len = psize;
7377 7396          bcopy(ptr, buf, psize);
7378 7397  done:
7379 7398          mutex_exit(&zone->zone_lock);
7380 7399          zone_rele(zone);
7381 7400          if (err != 0)
7382 7401                  return (set_errno(err));
7383 7402          else
7384 7403                  return (0);
7385 7404  }

↓ open down ↓

881 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX