1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1988 AT&T     */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  31  */
  32 
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/pcb.h>
  37 #include <sys/systm.h>
  38 #include <sys/signal.h>
  39 #include <sys/cred.h>
  40 #include <sys/user.h>
  41 #include <sys/vfs.h>
  42 #include <sys/vnode.h>
  43 #include <sys/proc.h>
  44 #include <sys/time.h>
  45 #include <sys/file.h>
  46 #include <sys/priocntl.h>
  47 #include <sys/procset.h>
  48 #include <sys/disp.h>
  49 #include <sys/callo.h>
  50 #include <sys/callb.h>
  51 #include <sys/debug.h>
  52 #include <sys/conf.h>
  53 #include <sys/bootconf.h>
  54 #include <sys/utsname.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/vmparam.h>
  57 #include <sys/modctl.h>
  58 #include <sys/vm.h>
  59 #include <sys/callb.h>
  60 #include <sys/ddi_periodic.h>
  61 #include <sys/kmem.h>
  62 #include <sys/vmem.h>
  63 #include <sys/cpuvar.h>
  64 #include <sys/cladm.h>
  65 #include <sys/corectl.h>
  66 #include <sys/exec.h>
  67 #include <sys/syscall.h>
  68 #include <sys/reboot.h>
  69 #include <sys/task.h>
  70 #include <sys/exacct.h>
  71 #include <sys/autoconf.h>
  72 #include <sys/errorq.h>
  73 #include <sys/class.h>
  74 #include <sys/stack.h>
  75 #include <sys/brand.h>
  76 #include <sys/mmapobj.h>
  77 
  78 #include <vm/as.h>
  79 #include <vm/seg_kmem.h>
  80 #include <sys/dc_ki.h>
  81 
  82 #include <c2/audit.h>
  83 #include <sys/bootprops.h>
  84 
/* well known processes */
proc_t *proc_sched;             /* memory scheduler */
proc_t *proc_init;              /* init; start_init() sets it to curproc */
proc_t *proc_pageout;           /* pageout daemon */
proc_t *proc_fsflush;           /* fsflush daemon */
proc_t *proc_intrd;             /* interrupt daemon */

pgcnt_t maxmem;         /* Maximum available memory in pages.   */
pgcnt_t freemem;        /* Current available memory in pages.   */
int     interrupts_unleashed;   /* set when we do the first spl0() */

/* Created by main() right after interrupts have been enabled. */
kmem_cache_t *process_cache;    /* kmem cache for proc structures */

/*
 * Indicates whether the auditing module (c2audit) is loaded. Possible
 * values are:
 * 0 - c2audit module is excluded in /etc/system and cannot be loaded
 * 1 - c2audit module is not loaded but can be anytime
 * 2 - c2audit module is loaded
 *
 * NOTE(review): the initializer is the symbolic C2AUDIT_DISABLED; which of
 * the numeric values above it corresponds to is not visible here -- confirm
 * against c2/audit.h.
 */
int audit_active = C2AUDIT_DISABLED;

/*
 * Process 0's lwp directory and lwpid hash table.
 *
 * Both tables have two entries.  main() installs them in process 0
 * (p_lwpdir/p_lwpfree and p_tidhash, with both sizes set to 2), fills in
 * p0_lep from the current thread and hashes it in with lwp_hash_in().
 */
lwpdir_t p0_lwpdir[2];
tidhash_t p0_tidhash[2];
lwpent_t p0_lep;
 113 
/*
 * main() -- machine-independent initialization code.
 * (main() itself is defined further down in this file.)
 * Called from the cold start routine as soon as a stack and
 * segmentation have been established.
 * Functions:
 *      clear and free user core
 *      turn on clock
 *      hand craft 0th process
 *      call all initialization routines
 *      fork    - process 0 to schedule
 *              - process 1 execute bootstrap
 *              - process 2 to page out
 *      create system threads
 */
 129 
/*
 * Cluster boot flags.  main() tests this word for CLUSTER_BOOTED and only
 * then forks the cluster process (cluster_wrapper()).
 */
int cluster_bootflags = 0;
 131 
 132 void
 133 cluster_wrapper(void)
 134 {
 135         cluster();
 136         panic("cluster()  returned");
 137 }
 138 
/*
 * Default path and argument string for init.  The launch code visible in
 * this file does not read these directly: start_init_common() takes the
 * path and arguments from the zone (zone_initname, zone_bootargs).
 * NOTE(review): presumably zone0's values are seeded from these two
 * buffers -- confirm against the zone code.
 */
char initname[INITNAME_SZ] = "/sbin/init";      /* also referenced by zone0 */
char initargs[BOOTARGS_MAX] = "";               /* also referenced by zone0 */
 141 
 142 /*
 143  * Construct a stack for init containing the arguments to it, then
 144  * pass control to exec_common.
 145  */
 146 int
 147 exec_init(const char *initpath, const char *args)
 148 {
 149         caddr32_t ucp;
 150         caddr32_t *uap;
 151         caddr32_t *argv;
 152         caddr32_t exec_fnamep;
 153         char *scratchargs;
 154         int i, sarg;
 155         size_t argvlen, alen;
 156         boolean_t in_arg;
 157         int argc = 0;
 158         int error = 0, count = 0;
 159         proc_t *p = ttoproc(curthread);
 160         klwp_t *lwp = ttolwp(curthread);
 161         int brand_action;
 162 
 163         if (args == NULL)
 164                 args = "";
 165 
 166         alen = strlen(initpath) + 1 + strlen(args) + 1;
 167         scratchargs = kmem_alloc(alen, KM_SLEEP);
 168         (void) snprintf(scratchargs, alen, "%s %s", initpath, args);
 169 
 170         /*
 171          * We do a quick two state parse of the string to sort out how big
 172          * argc should be.
 173          */
 174         in_arg = B_FALSE;
 175         for (i = 0; i < strlen(scratchargs); i++) {
 176                 if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
 177                         if (in_arg) {
 178                                 in_arg = B_FALSE;
 179                                 argc++;
 180                         }
 181                 } else {
 182                         in_arg = B_TRUE;
 183                 }
 184         }
 185         argvlen = sizeof (caddr32_t) * (argc + 1);
 186         argv = kmem_zalloc(argvlen, KM_SLEEP);
 187 
 188         /*
 189          * We pull off a bit of a hack here.  We work our way through the
 190          * args string, putting nulls at the ends of space delimited tokens
 191          * (boot args don't support quoting at this time).  Then we just
 192          * copy the whole mess to userland in one go.  In other words, we
 193          * transform this: "init -s -r\0" into this on the stack:
 194          *
 195          *      -0x00 \0
 196          *      -0x01 r
 197          *      -0x02 -  <--------.
 198          *      -0x03 \0          |
 199          *      -0x04 s           |
 200          *      -0x05 -  <------. |
 201          *      -0x06 \0        | |
 202          *      -0x07 t         | |
 203          *      -0x08 i         | |
 204          *      -0x09 n         | |
 205          *      -0x0a i  <---.  | |
 206          *      -0x10 NULL   |  | |     (argv[3])
 207          *      -0x14   -----|--|-'     (argv[2])
 208          *      -0x18  ------|--'       (argv[1])
 209          *      -0x1c -------'          (argv[0])
 210          *
 211          * Since we know the value of ucp at the beginning of this process,
 212          * we can trivially compute the argv[] array which we also need to
 213          * place in userland: argv[i] = ucp - sarg(i), where ucp is the
 214          * stack ptr, and sarg is the string index of the start of the
 215          * argument.
 216          */
 217         ucp = (caddr32_t)(uintptr_t)p->p_usrstack;
 218 
 219         argc = 0;
 220         in_arg = B_FALSE;
 221         sarg = 0;
 222 
 223         for (i = 0; i < alen; i++) {
 224                 if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
 225                         if (in_arg == B_TRUE) {
 226                                 in_arg = B_FALSE;
 227                                 scratchargs[i] = '\0';
 228                                 argv[argc++] = ucp - (alen - sarg);
 229                         }
 230                 } else if (in_arg == B_FALSE) {
 231                         in_arg = B_TRUE;
 232                         sarg = i;
 233                 }
 234         }
 235         ucp -= alen;
 236         error |= copyout(scratchargs, (caddr_t)(uintptr_t)ucp, alen);
 237 
 238         uap = (caddr32_t *)P2ALIGN((uintptr_t)ucp, sizeof (caddr32_t));
 239         uap--;  /* advance to be below the word we're in */
 240         uap -= (argc + 1);      /* advance argc words down, plus one for NULL */
 241         error |= copyout(argv, uap, argvlen);
 242 
 243         if (error != 0) {
 244                 zcmn_err(p->p_zone->zone_id, CE_WARN,
 245                     "Could not construct stack for init.\n");
 246                 kmem_free(argv, argvlen);
 247                 kmem_free(scratchargs, alen);
 248                 return (EFAULT);
 249         }
 250 
 251         exec_fnamep = argv[0];
 252         kmem_free(argv, argvlen);
 253         kmem_free(scratchargs, alen);
 254 
 255         /*
 256          * Point at the arguments.
 257          */
 258         lwp->lwp_ap = lwp->lwp_arg;
 259         lwp->lwp_arg[0] = (uintptr_t)exec_fnamep;
 260         lwp->lwp_arg[1] = (uintptr_t)uap;
 261         lwp->lwp_arg[2] = NULL;
 262         curthread->t_post_sys = 1;
 263         curthread->t_sysnum = SYS_execve;
 264 
 265         /*
 266          * If we are executing init from zsched, we may have inherited its
 267          * parent process's signal mask.  Clear it now so that we behave in
 268          * the same way as when started from the global zone.
 269          */
 270         sigemptyset(&curthread->t_hold);
 271 
 272         brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
 273 again:
 274         error = exec_common((const char *)(uintptr_t)exec_fnamep,
 275             (const char **)(uintptr_t)uap, NULL, brand_action);
 276 
 277         /*
 278          * Normally we would just set lwp_argsaved and t_post_sys and
 279          * let post_syscall reset lwp_ap for us.  Unfortunately,
 280          * exec_init isn't always called from a system call.  Instead
 281          * of making a mess of trap_cleanup, we just reset the args
 282          * pointer here.
 283          */
 284         reset_syscall_args();
 285 
 286         switch (error) {
 287         case 0:
 288                 return (0);
 289 
 290         case ENOENT:
 291                 zcmn_err(p->p_zone->zone_id, CE_WARN,
 292                     "exec(%s) failed (file not found).\n", initpath);
 293                 return (ENOENT);
 294 
 295         case EAGAIN:
 296         case EINTR:
 297                 ++count;
 298                 if (count < 5) {
 299                         zcmn_err(p->p_zone->zone_id, CE_WARN,
 300                             "exec(%s) failed with errno %d.  Retrying...\n",
 301                             initpath, error);
 302                         goto again;
 303                 }
 304         }
 305 
 306         zcmn_err(p->p_zone->zone_id, CE_WARN,
 307             "exec(%s) failed with errno %d.", initpath, error);
 308         return (error);
 309 }
 310 
 311 /*
 312  * This routine does all of the common setup for invoking init; global
 313  * and non-global zones employ this routine for the functionality which is
 314  * in common.
 315  *
 316  * This program (init, presumably) must be a 32-bit process.
 317  */
 318 int
 319 start_init_common()
 320 {
 321         proc_t *p = curproc;
 322         ASSERT_STACK_ALIGNED();
 323         p->p_zone->zone_proc_initpid = p->p_pid;
 324 
 325         p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
 326         p->p_usrstack = (caddr_t)USRSTACK32;
 327         p->p_model = DATAMODEL_ILP32;
 328         p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
 329         p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
 330         p->p_stk_ctl = INT32_MAX;
 331 
 332         p->p_as = as_alloc();
 333         p->p_as->a_proc = p;
 334         p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
 335         (void) hat_setup(p->p_as->a_hat, HAT_INIT);
 336 
 337         init_core();
 338 
 339         init_mstate(curthread, LMS_SYSTEM);
 340         return (exec_init(p->p_zone->zone_initname, p->p_zone->zone_bootargs));
 341 }
 342 
 343 /*
 344  * Start the initial user process for the global zone; once running, if
 345  * init should subsequently fail, it will be automatically be caught in the
 346  * exit(2) path, and restarted by restart_init().
 347  */
 348 static void
 349 start_init(void)
 350 {
 351         proc_init = curproc;
 352 
 353         ASSERT(curproc->p_zone->zone_initname != NULL);
 354 
 355         if (start_init_common() != 0)
 356                 halt("unix: Could not start init");
 357         lwp_rtt();
 358 }
 359 
 360 void
 361 main(void)
 362 {
 363         proc_t          *p = ttoproc(curthread);        /* &p0 */
 364         int             (**initptr)();
 365         extern void     sched();
 366         extern void     fsflush();
 367         extern void     interrupt_balancer();
 368         extern int      (*init_tbl[])();
 369         extern int      (*mp_init_tbl[])();
 370         extern id_t     syscid, defaultcid;
 371         extern int      swaploaded;
 372         extern int      netboot;
 373         extern ib_boot_prop_t *iscsiboot_prop;
 374         extern void     vm_init(void);
 375         extern void     cbe_init_pre(void);
 376         extern void     cbe_init(void);
 377         extern void     clock_tick_init_pre(void);
 378         extern void     clock_tick_init_post(void);
 379         extern void     clock_init(void);
 380         extern void     physio_bufs_init(void);
 381         extern void     pm_cfb_setup_intr(void);
 382         extern int      pm_adjust_timestamps(dev_info_t *, void *);
 383         extern void     start_other_cpus(int);
 384         extern void     sysevent_evc_thrinit();
 385         extern kmutex_t ualock;
 386 #if defined(__x86)
 387         extern void     fastboot_post_startup(void);
 388         extern void     progressbar_start(void);
 389 #endif
 390         /*
 391          * In the horrible world of x86 in-lines, you can't get symbolic
 392          * structure offsets a la genassym.  This assertion is here so
 393          * that the next poor slob who innocently changes the offset of
 394          * cpu_thread doesn't waste as much time as I just did finding
 395          * out that it's hard-coded in i86/ml/i86.il.  Similarly for
 396          * curcpup.  You're welcome.
 397          */
 398         ASSERT(CPU == CPU->cpu_self);
 399         ASSERT(curthread == CPU->cpu_thread);
 400         ASSERT_STACK_ALIGNED();
 401 
 402         /*
 403          * We take the ualock until we have completed the startup
 404          * to prevent kadmin() from disrupting this work. In particular,
 405          * we don't want kadmin() to bring the system down while we are
 406          * trying to start it up.
 407          */
 408         mutex_enter(&ualock);
 409 
 410         /*
 411          * Setup root lgroup and leaf lgroup for CPU 0
 412          */
 413         lgrp_init(LGRP_INIT_STAGE2);
 414 
 415         /*
 416          * Once 'startup()' completes, the thread_reaper() daemon would be
 417          * created(in thread_init()). After that, it is safe to create threads
 418          * that could exit. These exited threads will get reaped.
 419          */
 420         startup();
 421         segkmem_gc();
 422         callb_init();
 423         cbe_init_pre(); /* x86 must initialize gethrtimef before timer_init */
 424         ddi_periodic_init();
 425         cbe_init();
 426         callout_init(); /* callout table MUST be init'd after cyclics */
 427         clock_tick_init_pre();
 428         clock_init();
 429 
 430 #if defined(__x86)
 431         /*
 432          * The progressbar thread uses cv_reltimedwait() and hence needs to be
 433          * started after the callout mechanism has been initialized.
 434          */
 435         progressbar_start();
 436 #endif
 437         /*
 438          * On some platforms, clkinitf() changes the timing source that
 439          * gethrtime_unscaled() uses to generate timestamps.  cbe_init() calls
 440          * clkinitf(), so re-initialize the microstate counters after the
 441          * timesource has been chosen.
 442          */
 443         init_mstate(&t0, LMS_SYSTEM);
 444         init_cpu_mstate(CPU, CMS_SYSTEM);
 445 
 446         /*
 447          * May need to probe to determine latencies from CPU 0 after
 448          * gethrtime() comes alive in cbe_init() and before enabling interrupts
 449          * and copy and release any temporary memory allocated with BOP_ALLOC()
 450          * before release_bootstrap() frees boot memory
 451          */
 452         lgrp_init(LGRP_INIT_STAGE3);
 453 
 454         /*
 455          * Call all system initialization functions.
 456          */
 457         for (initptr = &init_tbl[0]; *initptr; initptr++)
 458                 (**initptr)();
 459         /*
 460          * Load iSCSI boot properties
 461          */
 462         ld_ib_prop();
 463         /*
 464          * initialize vm related stuff.
 465          */
 466         vm_init();
 467 
 468         /*
 469          * initialize buffer pool for raw I/O requests
 470          */
 471         physio_bufs_init();
 472 
 473         ttolwp(curthread)->lwp_error = 0; /* XXX kludge for SCSI driver */
 474 
 475         /*
 476          * Drop the interrupt level and allow interrupts.  At this point
 477          * the DDI guarantees that interrupts are enabled.
 478          */
 479         (void) spl0();
 480         interrupts_unleashed = 1;
 481 
 482         /*
 483          * Create kmem cache for proc structures
 484          */
 485         process_cache = kmem_cache_create("process_cache", sizeof (proc_t),
 486             0, NULL, NULL, NULL, NULL, NULL, 0);
 487 
 488         vfs_mountroot();        /* Mount the root file system */
 489         errorq_init();          /* after vfs_mountroot() so DDI root is ready */
 490         cpu_kstat_init(CPU);    /* after vfs_mountroot() so TOD is valid */
 491         ddi_walk_devs(ddi_root_node(), pm_adjust_timestamps, NULL);
 492                                 /* after vfs_mountroot() so hrestime is valid */
 493 
 494         post_startup();
 495         swaploaded = 1;
 496 
 497         /*
 498          * Initialize Solaris Audit Subsystem
 499          */
 500         audit_init();
 501 
 502         /*
 503          * Plumb the protocol modules and drivers only if we are not
 504          * networked booted, in this case we already did it in rootconf().
 505          */
 506         if (netboot == 0 && iscsiboot_prop == NULL)
 507                 (void) strplumb();
 508 
 509         gethrestime(&PTOU(curproc)->u_start);
 510         curthread->t_start = PTOU(curproc)->u_start.tv_sec;
 511         p->p_mstart = gethrtime();
 512 
 513         /*
 514          * Perform setup functions that can only be done after root
 515          * and swap have been set up.
 516          */
 517         consconfig();
 518 #ifndef __sparc
 519         release_bootstrap();
 520 #endif
 521 
 522         /*
 523          * attach drivers with ddi-forceattach prop
 524          * It must be done early enough to load hotplug drivers (e.g.
 525          * pcmcia nexus) so that devices enumerated via hotplug is
 526          * available before I/O subsystem is fully initialized.
 527          */
 528         i_ddi_forceattach_drivers();
 529 
 530         /*
 531          * Set the scan rate and other parameters of the paging subsystem.
 532          */
 533         setupclock(0);
 534 
 535         /*
 536          * Initialize process 0's lwp directory and lwpid hash table.
 537          */
 538         p->p_lwpdir = p->p_lwpfree = p0_lwpdir;
 539         p->p_lwpdir->ld_next = p->p_lwpdir + 1;
 540         p->p_lwpdir_sz = 2;
 541         p->p_tidhash = p0_tidhash;
 542         p->p_tidhash_sz = 2;
 543         p0_lep.le_thread = curthread;
 544         p0_lep.le_lwpid = curthread->t_tid;
 545         p0_lep.le_start = curthread->t_start;
 546         lwp_hash_in(p, &p0_lep, p0_tidhash, 2, 0);
 547 
 548         /*
 549          * Initialize extended accounting.
 550          */
 551         exacct_init();
 552 
 553         /*
 554          * Initialize threads of sysevent event channels
 555          */
 556         sysevent_evc_thrinit();
 557 
 558         /*
 559          * This must be done after post_startup() but before
 560          * start_other_cpus()
 561          */
 562         lgrp_init(LGRP_INIT_STAGE4);
 563 
 564         /*
 565          * Perform MP initialization, if any.
 566          */
 567         start_other_cpus(0);
 568 
 569 #ifdef  __sparc
 570         /*
 571          * Release bootstrap here since PROM interfaces are
 572          * used to start other CPUs above.
 573          */
 574         release_bootstrap();
 575 #endif
 576 
 577         /*
 578          * Finish lgrp initialization after all CPUS are brought online.
 579          */
 580         lgrp_init(LGRP_INIT_STAGE5);
 581 
 582         /*
 583          * After mp_init(), number of cpus are known (this is
 584          * true for the time being, when there are actually
 585          * hot pluggable cpus then this scheme  would not do).
 586          * Any per cpu initialization is done here.
 587          */
 588         kmem_mp_init();
 589         vmem_update(NULL);
 590 
 591         clock_tick_init_post();
 592 
 593         for (initptr = &mp_init_tbl[0]; *initptr; initptr++)
 594                 (**initptr)();
 595 
 596         /*
 597          * These must be called after start_other_cpus
 598          */
 599         pm_cfb_setup_intr();
 600 #if defined(__x86)
 601         fastboot_post_startup();
 602 #endif
 603 
 604         /*
 605          * Make init process; enter scheduling loop with system process.
 606          *
 607          * Note that we manually assign the pids for these processes, for
 608          * historical reasons.  If more pre-assigned pids are needed,
 609          * FAMOUS_PIDS will have to be updated.
 610          */
 611 
 612         /* create init process */
 613         if (newproc(start_init, NULL, defaultcid, 59, NULL,
 614             FAMOUS_PID_INIT))
 615                 panic("main: unable to fork init.");
 616 
 617         /* create pageout daemon */
 618         if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL,
 619             FAMOUS_PID_PAGEOUT))
 620                 panic("main: unable to fork pageout()");
 621 
 622         /* create fsflush daemon */
 623         if (newproc(fsflush, NULL, syscid, minclsyspri, NULL,
 624             FAMOUS_PID_FSFLUSH))
 625                 panic("main: unable to fork fsflush()");
 626 
 627         /* create interrupt balancer process */
 628         if (newproc(interrupt_balancer, NULL, syscid, minclsyspri, NULL, 0))
 629                 panic("main: unable to fork interrupt_balancer()");
 630 
 631         /* create cluster process if we're a member of one */
 632         if (cluster_bootflags & CLUSTER_BOOTED) {
 633                 if (newproc(cluster_wrapper, NULL, syscid, minclsyspri,
 634                     NULL, 0)) {
 635                         panic("main: unable to fork cluster()");
 636                 }
 637         }
 638 
 639         /*
 640          * Create system threads (threads are associated with p0)
 641          */
 642 
 643         /* create module uninstall daemon */
 644         /* BugID 1132273. If swapping over NFS need a bigger stack */
 645         (void) thread_create(NULL, 0, (void (*)())mod_uninstall_daemon,
 646             NULL, 0, &p0, TS_RUN, minclsyspri);
 647 
 648         (void) thread_create(NULL, 0, seg_pasync_thread,
 649             NULL, 0, &p0, TS_RUN, minclsyspri);
 650 
 651         pid_setmin();
 652 
 653         /* system is now ready */
 654         mutex_exit(&ualock);
 655 
 656         bcopy("sched", PTOU(curproc)->u_psargs, 6);
 657         bcopy("sched", PTOU(curproc)->u_comm, 5);
 658         sched();
 659         /* NOTREACHED */
 660 }