1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* Copyright (c) 1988 AT&T */
27 /* All Rights Reserved */
28
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/pcb.h>
33 #include <sys/systm.h>
34 #include <sys/signal.h>
35 #include <sys/cred.h>
36 #include <sys/user.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/proc.h>
40 #include <sys/time.h>
41 #include <sys/file.h>
42 #include <sys/priocntl.h>
43 #include <sys/procset.h>
44 #include <sys/disp.h>
45 #include <sys/callo.h>
46 #include <sys/callb.h>
47 #include <sys/debug.h>
48 #include <sys/conf.h>
49 #include <sys/bootconf.h>
50 #include <sys/utsname.h>
51 #include <sys/cmn_err.h>
52 #include <sys/vmparam.h>
53 #include <sys/modctl.h>
54 #include <sys/vm.h>
55 #include <sys/callb.h>
56 #include <sys/ddi_timer.h>
57 #include <sys/kmem.h>
58 #include <sys/vmem.h>
59 #include <sys/cpuvar.h>
60 #include <sys/cladm.h>
61 #include <sys/corectl.h>
62 #include <sys/exec.h>
63 #include <sys/syscall.h>
64 #include <sys/reboot.h>
65 #include <sys/task.h>
66 #include <sys/exacct.h>
67 #include <sys/autoconf.h>
68 #include <sys/errorq.h>
69 #include <sys/class.h>
70 #include <sys/stack.h>
71 #include <sys/brand.h>
72 #include <sys/mmapobj.h>
73
74 #include <vm/as.h>
75 #include <vm/seg_kmem.h>
76 #include <sys/dc_ki.h>
77
78 #include <c2/audit.h>
79 #include <sys/bootprops.h>
80
81 /* Well-known system processes. */
82 proc_t *proc_sched; /* memory scheduler */
83 proc_t *proc_init; /* init; set to curproc by start_init() */
84 proc_t *proc_pageout; /* pageout daemon */
85 proc_t *proc_fsflush; /* fsflush daemon */
86 proc_t *proc_intrd; /* interrupt daemon */
87
88 pgcnt_t maxmem; /* Maximum available memory in pages. */
89 pgcnt_t freemem; /* Current available memory in pages. */
90 int interrupts_unleashed; /* set to 1 by main() right after the first spl0() */
91
92 kmem_cache_t *process_cache; /* kmem cache for proc structures; created in main() */
93
94 /*
95 * Indicates whether the auditing module (c2audit) is loaded. Possible
96 * values are:
97 * 0 - c2audit module is excluded in /etc/system and cannot be loaded
98 * 1 - c2audit module is not loaded but can be loaded at any time
99 * 2 - c2audit module is loaded
100 */
101 int audit_active = C2AUDIT_DISABLED; /* NOTE(review): presumably state 0 above; confirm in <c2/audit.h> */
102
103 /*
104 * Process 0's lwp directory and lwpid hash table (two entries each).
105 */
106 lwpdir_t p0_lwpdir[2]; /* installed as p_lwpdir/p_lwpfree by main() */
107 tidhash_t p0_tidhash[2]; /* installed as p_tidhash by main() */
108 lwpent_t p0_lep; /* entry for main()'s curthread, hashed in by main() */
109
110 /*
111 * Machine-independent initialization code
112 * Called from cold start routine as
113 * soon as a stack and segmentation
114 * have been established.
115 * Functions:
116 * clear and free user core
117 * turn on clock
118 * hand craft 0th process
119 * call all initialization routines
120 * fork - process 0 to schedule
121 * - process 1 execute bootstrap
122 * - process 2 to page out
123 * create system threads
124 */
125
126 int cluster_bootflags = 0; /* main() forks the cluster process when CLUSTER_BOOTED is set */
127
/*
 * Body of the process that main() forks when CLUSTER_BOOTED is set in
 * cluster_bootflags: calls cluster() and panics if it ever returns.
 */
128 void
129 cluster_wrapper(void)
130 {
131 cluster();
132 panic("cluster() returned");
133 }
134
135 char initname[INITNAME_SZ] = "/sbin/init"; /* init path, "/sbin/init" unless changed; also referenced by zone0 */
136 char initargs[BOOTARGS_MAX] = ""; /* init arguments, empty unless changed; also referenced by zone0 */
137
138 /*
139 * Construct a stack for init containing the arguments to it, then
140 * pass control to exec_common.
141 */
142 int
143 exec_init(const char *initpath, const char *args)
144 {
145 caddr32_t ucp;
146 caddr32_t *uap;
147 caddr32_t *argv;
148 caddr32_t exec_fnamep;
149 char *scratchargs;
150 int i, sarg;
151 size_t argvlen, alen;
152 boolean_t in_arg;
153 int argc = 0;
154 int error = 0, count = 0;
155 proc_t *p = ttoproc(curthread);
156 klwp_t *lwp = ttolwp(curthread);
157 int brand_action;
158
159 if (args == NULL)
160 args = "";
161
162 alen = strlen(initpath) + 1 + strlen(args) + 1;
163 scratchargs = kmem_alloc(alen, KM_SLEEP);
164 (void) snprintf(scratchargs, alen, "%s %s", initpath, args);
165
166 /*
167 * We do a quick two state parse of the string to sort out how big
168 * argc should be.
169 */
170 in_arg = B_FALSE;
171 for (i = 0; i < strlen(scratchargs); i++) {
172 if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
173 if (in_arg) {
174 in_arg = B_FALSE;
175 argc++;
176 }
177 } else {
178 in_arg = B_TRUE;
179 }
180 }
181 argvlen = sizeof (caddr32_t) * (argc + 1);
182 argv = kmem_zalloc(argvlen, KM_SLEEP);
183
184 /*
185 * We pull off a bit of a hack here. We work our way through the
186 * args string, putting nulls at the ends of space delimited tokens
187 * (boot args don't support quoting at this time). Then we just
188 * copy the whole mess to userland in one go. In other words, we
189 * transform this: "init -s -r\0" into this on the stack:
190 *
191 * -0x00 \0
192 * -0x01 r
193 * -0x02 - <--------.
194 * -0x03 \0 |
195 * -0x04 s |
196 * -0x05 - <------. |
197 * -0x06 \0 | |
198 * -0x07 t | |
199 * -0x08 i | |
200 * -0x09 n | |
201 * -0x0a i <---. | |
202 * -0x10 NULL | | | (argv[3])
203 * -0x14 -----|--|-' (argv[2])
204 * -0x18 ------|--' (argv[1])
205 * -0x1c -------' (argv[0])
206 *
207 * Since we know the value of ucp at the beginning of this process,
208 * we can trivially compute the argv[] array which we also need to
209 * place in userland: argv[i] = ucp - sarg(i), where ucp is the
210 * stack ptr, and sarg is the string index of the start of the
211 * argument.
212 */
213 ucp = (caddr32_t)(uintptr_t)p->p_usrstack;
214
215 argc = 0;
216 in_arg = B_FALSE;
217 sarg = 0;
218
219 for (i = 0; i < alen; i++) {
220 if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
221 if (in_arg == B_TRUE) {
222 in_arg = B_FALSE;
223 scratchargs[i] = '\0';
224 argv[argc++] = ucp - (alen - sarg);
225 }
226 } else if (in_arg == B_FALSE) {
227 in_arg = B_TRUE;
228 sarg = i;
229 }
230 }
231 ucp -= alen;
232 error |= copyout(scratchargs, (caddr_t)(uintptr_t)ucp, alen);
233
234 uap = (caddr32_t *)P2ALIGN((uintptr_t)ucp, sizeof (caddr32_t));
235 uap--; /* advance to be below the word we're in */
236 uap -= (argc + 1); /* advance argc words down, plus one for NULL */
237 error |= copyout(argv, uap, argvlen);
238
239 if (error != 0) {
240 zcmn_err(p->p_zone->zone_id, CE_WARN,
241 "Could not construct stack for init.\n");
242 kmem_free(argv, argvlen);
243 kmem_free(scratchargs, alen);
244 return (EFAULT);
245 }
246
247 exec_fnamep = argv[0];
248 kmem_free(argv, argvlen);
249 kmem_free(scratchargs, alen);
250
251 /*
252 * Point at the arguments.
253 */
254 lwp->lwp_ap = lwp->lwp_arg;
255 lwp->lwp_arg[0] = (uintptr_t)exec_fnamep;
256 lwp->lwp_arg[1] = (uintptr_t)uap;
257 lwp->lwp_arg[2] = NULL;
258 curthread->t_post_sys = 1;
259 curthread->t_sysnum = SYS_execve;
260
261 /*
262 * If we are executing init from zsched, we may have inherited its
263 * parent process's signal mask. Clear it now so that we behave in
264 * the same way as when started from the global zone.
265 */
266 sigemptyset(&curthread->t_hold);
267
268 brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
269 again:
270 error = exec_common((const char *)(uintptr_t)exec_fnamep,
271 (const char **)(uintptr_t)uap, NULL, brand_action);
272
273 /*
274 * Normally we would just set lwp_argsaved and t_post_sys and
275 * let post_syscall reset lwp_ap for us. Unfortunately,
276 * exec_init isn't always called from a system call. Instead
277 * of making a mess of trap_cleanup, we just reset the args
278 * pointer here.
279 */
280 reset_syscall_args();
281
282 switch (error) {
283 case 0:
284 return (0);
285
286 case ENOENT:
287 zcmn_err(p->p_zone->zone_id, CE_WARN,
288 "exec(%s) failed (file not found).\n", initpath);
289 return (ENOENT);
290
291 case EAGAIN:
292 case EINTR:
293 ++count;
294 if (count < 5) {
295 zcmn_err(p->p_zone->zone_id, CE_WARN,
296 "exec(%s) failed with errno %d. Retrying...\n",
297 initpath, error);
298 goto again;
299 }
300 }
301
302 zcmn_err(p->p_zone->zone_id, CE_WARN,
303 "exec(%s) failed with errno %d.", initpath, error);
304 return (error);
305 }
306
307 /*
308 * This routine does all of the common setup for invoking init; global
309 * and non-global zones employ this routine for the functionality which is
310 * in common.
311 *
312 * This program (init, presumably) must be a 32-bit process.
313 */
314 int
315 start_init_common()
316 {
317 proc_t *p = curproc;
318 ASSERT_STACK_ALIGNED();
319 p->p_zone->zone_proc_initpid = p->p_pid;
320
321 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
322 p->p_usrstack = (caddr_t)USRSTACK32;
323 p->p_model = DATAMODEL_ILP32;
324 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
325 p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
326 p->p_stk_ctl = INT32_MAX;
327
328 p->p_as = as_alloc();
329 p->p_as->a_proc = p;
330 p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
331 (void) hat_setup(p->p_as->a_hat, HAT_INIT);
332
333 init_core();
334
335 init_mstate(curthread, LMS_SYSTEM);
336 return (exec_init(p->p_zone->zone_initname, p->p_zone->zone_bootargs));
337 }
338
339 /*
340 * Start the initial user process for the global zone; once running, if
341 * init should subsequently fail, it will be automatically be caught in the
342 * exit(2) path, and restarted by restart_init().
343 */
344 static void
345 start_init(void)
346 {
347 proc_init = curproc;
348
349 ASSERT(curproc->p_zone->zone_initname != NULL);
350
351 if (start_init_common() != 0)
352 halt("unix: Could not start init");
353 lwp_rtt();
354 }
355
/*
 * main() - machine-independent kernel startup, run in the context of
 * process 0 (p0).
 *
 * Holds ualock for the whole bring-up, runs the subsystem initialization
 * sequence below in order (the order matters; see the per-step comments),
 * forks init, pageout, fsflush and intrd (plus the cluster process when
 * CLUSTER_BOOTED is set), creates the p0 system threads, and finally
 * renames itself "sched" and calls sched(), which is not expected to
 * return.
 */
356 void
357 main(void)
358 {
359 proc_t *p = ttoproc(curthread); /* &p0 */
360 int (**initptr)(); /* walks the NULL-terminated init_tbl[] and mp_init_tbl[] */
361 extern void sched();
362 extern void fsflush();
363 extern void intrd();
364 extern int (*init_tbl[])();
365 extern int (*mp_init_tbl[])();
366 extern id_t syscid, defaultcid;
367 extern int swaploaded;
368 extern int netboot;
369 extern ib_boot_prop_t *iscsiboot_prop;
370 extern void vm_init(void);
371 extern void cbe_init_pre(void);
372 extern void cbe_init(void);
373 extern void clock_tick_init_pre(void);
374 extern void clock_tick_init_post(void);
375 extern void clock_init(void);
376 extern void physio_bufs_init(void);
377 extern void pm_cfb_setup_intr(void);
378 extern int pm_adjust_timestamps(dev_info_t *, void *);
379 extern void start_other_cpus(int);
380 extern void sysevent_evc_thrinit();
381 extern kmutex_t ualock;
382 #if defined(__x86)
383 extern void fastboot_post_startup(void);
384 extern void progressbar_start(void);
385 #endif
386 /*
387 * In the horrible world of x86 in-lines, you can't get symbolic
388 * structure offsets a la genassym. This assertion is here so
389 * that the next poor slob who innocently changes the offset of
390 * cpu_thread doesn't waste as much time as I just did finding
391 * out that it's hard-coded in i86/ml/i86.il. Similarly for
392 * curcpup. You're welcome.
393 */
394 ASSERT(CPU == CPU->cpu_self);
395 ASSERT(curthread == CPU->cpu_thread);
396 ASSERT_STACK_ALIGNED();
397
398 /*
399 * We take the ualock until we have completed the startup
400 * to prevent kadmin() from disrupting this work. In particular,
401 * we don't want kadmin() to bring the system down while we are
402 * trying to start it up.
403 */
404 mutex_enter(&ualock);
405
406 /*
407 * Setup root lgroup and leaf lgroup for CPU 0
408 */
409 lgrp_init(LGRP_INIT_STAGE2);
410
411 /*
412 * Once 'startup()' completes, the thread_reaper() daemon will have been
413 * created (in thread_init()). After that, it is safe to create threads
414 * that could exit. These exited threads will get reaped.
415 */
416 startup();
417 segkmem_gc();
418 callb_init();
419 cbe_init_pre(); /* x86 must initialize gethrtimef before timer_init */
420 timer_init(); /* timer must be initialized before cyclic starts */
421 cbe_init();
422 callout_init(); /* callout table MUST be init'd after cyclics */
423 clock_tick_init_pre();
424 clock_init();
425
426 #if defined(__x86)
427 /*
428 * The progressbar thread uses cv_reltimedwait() and hence needs to be
429 * started after the callout mechanism has been initialized.
430 */
431 progressbar_start();
432 #endif
433 /*
434 * On some platforms, clkinitf() changes the timing source that
435 * gethrtime_unscaled() uses to generate timestamps. cbe_init() calls
436 * clkinitf(), so re-initialize the microstate counters after the
437 * timesource has been chosen.
438 */
439 init_mstate(&t0, LMS_SYSTEM);
440 init_cpu_mstate(CPU, CMS_SYSTEM);
441
442 /*
443 * May need to probe to determine latencies from CPU 0 after
444 * gethrtime() comes alive in cbe_init() and before enabling interrupts
445 * and copy and release any temporary memory allocated with BOP_ALLOC()
446 * before release_bootstrap() frees boot memory
447 */
448 lgrp_init(LGRP_INIT_STAGE3);
449
450 /*
451 * Call all system initialization functions.
452 */
453 for (initptr = &init_tbl[0]; *initptr; initptr++)
454 (**initptr)();
455 /*
456 * Load iSCSI boot properties
457 */
458 ld_ib_prop();
459 /*
460 * initialize vm related stuff.
461 */
462 vm_init();
463
464 /*
465 * initialize buffer pool for raw I/O requests
466 */
467 physio_bufs_init();
468
469 ttolwp(curthread)->lwp_error = 0; /* XXX kludge for SCSI driver */
470
471 /*
472 * Drop the interrupt level and allow interrupts. At this point
473 * the DDI guarantees that interrupts are enabled.
474 */
475 (void) spl0();
476 interrupts_unleashed = 1;
477
478 /*
479 * Create kmem cache for proc structures
480 */
481 process_cache = kmem_cache_create("process_cache", sizeof (proc_t),
482 0, NULL, NULL, NULL, NULL, NULL, 0);
483
484 vfs_mountroot(); /* Mount the root file system */
485 errorq_init(); /* after vfs_mountroot() so DDI root is ready */
486 cpu_kstat_init(CPU); /* after vfs_mountroot() so TOD is valid */
487 ddi_walk_devs(ddi_root_node(), pm_adjust_timestamps, NULL);
488 /* after vfs_mountroot() so hrestime is valid */
489
490 post_startup();
491 swaploaded = 1;
492
493 /*
494 * Initialize Solaris Audit Subsystem
495 */
496 audit_init();
497
498 /*
499 * Plumb the protocol modules and drivers only if we are not
500 * network booted; in that case we already did it in rootconf().
501 */
502 if (netboot == 0 && iscsiboot_prop == NULL)
503 (void) strplumb();
504
/* Record process 0's start time (wall clock and hrtime). */
505 gethrestime(&PTOU(curproc)->u_start);
506 curthread->t_start = PTOU(curproc)->u_start.tv_sec;
507 p->p_mstart = gethrtime();
508
509 /*
510 * Perform setup functions that can only be done after root
511 * and swap have been set up.
512 */
513 consconfig();
514 #ifndef __sparc
515 release_bootstrap();
516 #endif
517
518 /*
519 * attach drivers with ddi-forceattach prop
520 * It must be done early enough to load hotplug drivers (e.g.
521 * pcmcia nexus) so that devices enumerated via hotplug are
522 * available before I/O subsystem is fully initialized.
523 */
524 i_ddi_forceattach_drivers();
525
526 /*
527 * Set the scan rate and other parameters of the paging subsystem.
528 */
529 setupclock(0);
530
531 /*
532 * Initialize process 0's lwp directory and lwpid hash table.
533 * Both are the static two-entry p0_lwpdir[]/p0_tidhash[] arrays;
534 * p0_lep describes the current thread and is hashed into them.
535 */
534 p->p_lwpdir = p->p_lwpfree = p0_lwpdir;
535 p->p_lwpdir->ld_next = p->p_lwpdir + 1;
536 p->p_lwpdir_sz = 2;
537 p->p_tidhash = p0_tidhash;
538 p->p_tidhash_sz = 2;
539 p0_lep.le_thread = curthread;
540 p0_lep.le_lwpid = curthread->t_tid;
541 p0_lep.le_start = curthread->t_start;
542 lwp_hash_in(p, &p0_lep, p0_tidhash, 2, 0);
543
544 /*
545 * Initialize extended accounting.
546 */
547 exacct_init();
548
549 /*
550 * Initialize threads of sysevent event channels
551 */
552 sysevent_evc_thrinit();
553
554 /*
555 * This must be done after post_startup() but before
556 * start_other_cpus()
557 */
558 lgrp_init(LGRP_INIT_STAGE4);
559
560 /*
561 * Perform MP initialization, if any.
562 */
563 start_other_cpus(0);
564
565 #ifdef __sparc
566 /*
567 * Release bootstrap here since PROM interfaces are
568 * used to start other CPUs above.
569 */
570 release_bootstrap();
571 #endif
572
573 /*
574 * Finish lgrp initialization after all CPUS are brought online.
575 */
576 lgrp_init(LGRP_INIT_STAGE5);
577
578 /*
579 * After mp_init(), the number of cpus is known (this is
580 * true for the time being, when there are actually
581 * hot pluggable cpus then this scheme would not do).
582 * Any per cpu initialization is done here.
583 */
584 kmem_mp_init();
585 vmem_update(NULL);
586
587 clock_tick_init_post();
588
/* Call the NULL-terminated table of MP initialization functions. */
589 for (initptr = &mp_init_tbl[0]; *initptr; initptr++)
590 (**initptr)();
591
592 /*
593 * These must be called after start_other_cpus
594 */
595 pm_cfb_setup_intr();
596 #if defined(__x86)
597 fastboot_post_startup();
598 #endif
599
600 /*
601 * Make init process; enter scheduling loop with system process.
602 *
603 * Note that we manually assign the pids for these processes, for
604 * historical reasons. If more pre-assigned pids are needed,
605 * FAMOUS_PIDS will have to be updated.
606 */
607
608 /* create init process (NOTE(review): 59 is passed where the daemons pass a syscid priority; confirm its meaning against newproc()) */
609 if (newproc(start_init, NULL, defaultcid, 59, NULL,
610 FAMOUS_PID_INIT))
611 panic("main: unable to fork init.");
612
613 /* create pageout daemon */
614 if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL,
615 FAMOUS_PID_PAGEOUT))
616 panic("main: unable to fork pageout()");
617
618 /* create fsflush daemon */
619 if (newproc(fsflush, NULL, syscid, minclsyspri, NULL,
620 FAMOUS_PID_FSFLUSH))
621 panic("main: unable to fork fsflush()");
622
623 /* create interrupt balancer daemon */
624 if (newproc(intrd, NULL, syscid, minclsyspri, NULL, 0))
625 panic("main: unable to fork intrd()");
626
627 /* create cluster process if we're a member of one */
628 if (cluster_bootflags & CLUSTER_BOOTED) {
629 if (newproc(cluster_wrapper, NULL, syscid, minclsyspri,
630 NULL, 0)) {
631 panic("main: unable to fork cluster()");
632 }
633 }
634
635 /*
636 * Create system threads (threads are associated with p0)
637 */
638
639 /* create module uninstall daemon */
640 /* BugID 1132273. If swapping over NFS need a bigger stack */
641 (void) thread_create(NULL, 0, (void (*)())mod_uninstall_daemon,
642 NULL, 0, &p0, TS_RUN, minclsyspri);
643
/* create the seg_pasync_thread system thread */
644 (void) thread_create(NULL, 0, seg_pasync_thread,
645 NULL, 0, &p0, TS_RUN, minclsyspri);
646
647 pid_setmin();
648
649 /* system is now ready */
650 mutex_exit(&ualock);
651
/*
 * Rename process 0 to "sched". The 6-byte copy into u_psargs includes
 * the terminating NUL; only the 5 name bytes are copied into u_comm.
 */
652 bcopy("sched", PTOU(curproc)->u_psargs, 6);
653 bcopy("sched", PTOU(curproc)->u_comm, 5);
654 sched();
655 /* NOTREACHED */
656 }