1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* Copyright (c) 1988 AT&T */ 27 /* All Rights Reserved */ 28 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/sysmacros.h> 32 #include <sys/pcb.h> 33 #include <sys/systm.h> 34 #include <sys/signal.h> 35 #include <sys/cred.h> 36 #include <sys/user.h> 37 #include <sys/vfs.h> 38 #include <sys/vnode.h> 39 #include <sys/proc.h> 40 #include <sys/time.h> 41 #include <sys/file.h> 42 #include <sys/priocntl.h> 43 #include <sys/procset.h> 44 #include <sys/disp.h> 45 #include <sys/callo.h> 46 #include <sys/callb.h> 47 #include <sys/debug.h> 48 #include <sys/conf.h> 49 #include <sys/bootconf.h> 50 #include <sys/utsname.h> 51 #include <sys/cmn_err.h> 52 #include <sys/vmparam.h> 53 #include <sys/modctl.h> 54 #include <sys/vm.h> 55 #include <sys/callb.h> 56 #include <sys/ddi_timer.h> 57 #include <sys/kmem.h> 58 #include <sys/vmem.h> 59 #include <sys/cpuvar.h> 60 #include <sys/cladm.h> 61 #include <sys/corectl.h> 62 #include <sys/exec.h> 63 #include <sys/syscall.h> 64 #include <sys/reboot.h> 65 #include <sys/task.h> 66 
#include <sys/exacct.h>
#include <sys/autoconf.h>
#include <sys/errorq.h>
#include <sys/class.h>
#include <sys/stack.h>
#include <sys/brand.h>
#include <sys/mmapobj.h>

#include <vm/as.h>
#include <vm/seg_kmem.h>
#include <sys/dc_ki.h>

#include <c2/audit.h>
#include <sys/bootprops.h>

/* well known processes */
proc_t *proc_sched;		/* memory scheduler */
proc_t *proc_init;		/* init */
proc_t *proc_pageout;		/* pageout daemon */
proc_t *proc_fsflush;		/* fsflush daemon */
proc_t *proc_intrd;		/* interrupt daemon */

pgcnt_t	maxmem;			/* Maximum available memory in pages. */
pgcnt_t	freemem;		/* Current available memory in pages. */
int	interrupts_unleashed;	/* set when we do the first spl0() */

kmem_cache_t *process_cache;	/* kmem cache for proc structures */

/*
 * Indicates whether the auditing module (c2audit) is loaded. Possible
 * values are:
 * 0 - c2audit module is excluded in /etc/system and cannot be loaded
 * 1 - c2audit module is not loaded but can be anytime
 * 2 - c2audit module is loaded
 */
int audit_active = C2AUDIT_DISABLED;

/*
 * Process 0's lwp directory and lwpid hash table.
 */
lwpdir_t p0_lwpdir[2];
tidhash_t p0_tidhash[2];
lwpent_t p0_lep;

/*
 * Machine-independent initialization code
 * Called from cold start routine as
 * soon as a stack and segmentation
 * have been established.
 * Functions:
 *	clear and free user core
 *	turn on clock
 *	hand craft 0th process
 *	call all initialization routines
 *	fork	- process 0 to schedule
 *		- process 1 execute bootstrap
 *		- process 2 to page out
 *	create system threads
 */

int cluster_bootflags = 0;

/*
 * Entry point handed to newproc() for the cluster process.  It simply runs
 * cluster(); that call is not expected to come back, so a return is fatal.
 */
void
cluster_wrapper(void)
{
	cluster();
	panic("cluster() returned");
}

char initname[INITNAME_SZ] = "/sbin/init";	/* also referenced by zone0 */
char initargs[BOOTARGS_MAX] = "";		/* also referenced by zone0 */

/*
 * Construct a stack for init containing the arguments to it, then
 * pass control to exec_common.
 *
 * initpath	path of the program to exec; it also becomes argv[0].
 * args		space separated argument string, or NULL for no arguments.
 *
 * Returns 0 on success, EFAULT if the argument strings or the argv[]
 * vector could not be copied out to the user stack, and otherwise the
 * error returned by exec_common().  EAGAIN and EINTR from exec_common()
 * are retried until five attempts in total have been made.
 */
int
exec_init(const char *initpath, const char *args)
{
	caddr32_t ucp;
	caddr32_t *uap;
	caddr32_t *argv;
	caddr32_t exec_fnamep;
	char *scratchargs;
	size_t i, sarg;
	size_t argvlen, alen;
	boolean_t in_arg;
	int argc = 0;
	int error = 0, count = 0;
	proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);
	int brand_action;

	if (args == NULL)
		args = "";

	/* path + ' ' + args + terminating NUL */
	alen = strlen(initpath) + 1 + strlen(args) + 1;
	scratchargs = kmem_alloc(alen, KM_SLEEP);
	(void) snprintf(scratchargs, alen, "%s %s", initpath, args);

	/*
	 * We do a quick two state parse of the string to sort out how big
	 * argc should be.
	 *
	 * The scan covers all alen bytes, i.e. it includes the terminating
	 * NUL, so that a final token which is not followed by a space is
	 * counted too.  This must agree with the tokenizing pass below;
	 * otherwise argv[] ends up one slot short and its NULL terminator
	 * is never copied out.
	 */
	in_arg = B_FALSE;
	for (i = 0; i < alen; i++) {
		if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
			if (in_arg) {
				in_arg = B_FALSE;
				argc++;
			}
		} else {
			in_arg = B_TRUE;
		}
	}
	/* One extra, zero-filled slot serves as the argv[] NULL terminator. */
	argvlen = sizeof (caddr32_t) * (argc + 1);
	argv = kmem_zalloc(argvlen, KM_SLEEP);

	/*
	 * We pull off a bit of a hack here.  We work our way through the
	 * args string, putting nulls at the ends of space delimited tokens
	 * (boot args don't support quoting at this time).  Then we just
	 * copy the whole mess to userland in one go.  In other words, we
	 * transform this: "init -s -r\0" into this on the stack:
	 *
	 *    -0x00 \0
	 *    -0x01 r
	 *    -0x02 -  <--------.
	 *    -0x03 \0          |
	 *    -0x04 s           |
	 *    -0x05 -  <------. |
	 *    -0x06 \0        | |
	 *    -0x07 t         | |
	 *    -0x08 i         | |
	 *    -0x09 n         | |
	 *    -0x0a i  <---.  | |
	 *    -0x10 NULL   |  | |	(argv[3])
	 *    -0x14   -----|--|-'	(argv[2])
	 *    -0x18  ------|--'		(argv[1])
	 *    -0x1c -------'		(argv[0])
	 *
	 * Since we know the value of ucp at the beginning of this process,
	 * we can trivially compute the argv[] array which we also need to
	 * place in userland: argv[i] = ucp - sarg(i), where ucp is the
	 * stack ptr, and sarg is the string index of the start of the
	 * argument.
	 */
	ucp = (caddr32_t)(uintptr_t)p->p_usrstack;

	argc = 0;
	in_arg = B_FALSE;
	sarg = 0;

	for (i = 0; i < alen; i++) {
		if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
			if (in_arg == B_TRUE) {
				in_arg = B_FALSE;
				scratchargs[i] = '\0';
				argv[argc++] = ucp - (alen - sarg);
			}
		} else if (in_arg == B_FALSE) {
			in_arg = B_TRUE;
			sarg = i;
		}
	}
	ucp -= alen;
	error |= copyout(scratchargs, (caddr_t)(uintptr_t)ucp, alen);

	uap = (caddr32_t *)P2ALIGN((uintptr_t)ucp, sizeof (caddr32_t));
	uap--;	/* advance to be below the word we're in */
	uap -= (argc + 1);	/* advance argc words down, plus one for NULL */
	error |= copyout(argv, uap, argvlen);

	if (error != 0) {
		zcmn_err(p->p_zone->zone_id, CE_WARN,
		    "Could not construct stack for init.\n");
		kmem_free(argv, argvlen);
		kmem_free(scratchargs, alen);
		return (EFAULT);
	}

	/* argv[0] is the user address of the program name copied out above. */
	exec_fnamep = argv[0];
	kmem_free(argv, argvlen);
	kmem_free(scratchargs, alen);

	/*
	 * Point at the arguments.
	 */
	lwp->lwp_ap = lwp->lwp_arg;
	lwp->lwp_arg[0] = (uintptr_t)exec_fnamep;
	lwp->lwp_arg[1] = (uintptr_t)uap;
	lwp->lwp_arg[2] = (uintptr_t)NULL;
	curthread->t_post_sys = 1;
	curthread->t_sysnum = SYS_execve;

	/*
	 * If we are executing init from zsched, we may have inherited its
	 * parent process's signal mask.  Clear it now so that we behave in
	 * the same way as when started from the global zone.
	 */
	sigemptyset(&curthread->t_hold);

	brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
again:
	error = exec_common((const char *)(uintptr_t)exec_fnamep,
	    (const char **)(uintptr_t)uap, NULL, brand_action);

	/*
	 * Normally we would just set lwp_argsaved and t_post_sys and
	 * let post_syscall reset lwp_ap for us.  Unfortunately,
	 * exec_init isn't always called from a system call.  Instead
	 * of making a mess of trap_cleanup, we just reset the args
	 * pointer here.
	 */
	reset_syscall_args();

	switch (error) {
	case 0:
		return (0);

	case ENOENT:
		zcmn_err(p->p_zone->zone_id, CE_WARN,
		    "exec(%s) failed (file not found).\n", initpath);
		return (ENOENT);

	case EAGAIN:
	case EINTR:
		++count;
		if (count < 5) {
			zcmn_err(p->p_zone->zone_id, CE_WARN,
			    "exec(%s) failed with errno %d.  Retrying...\n",
			    initpath, error);
			goto again;
		}
		/* Retries exhausted; report it as a plain failure below. */
		break;

	default:
		break;
	}

	zcmn_err(p->p_zone->zone_id, CE_WARN,
	    "exec(%s) failed with errno %d.", initpath, error);
	return (error);
}

/*
 * This routine does all of the common setup for invoking init; global
 * and non-global zones employ this routine for the functionality which is
 * in common.
 *
 * This program (init, presumably) must be a 32-bit process.
313 */ 314 int 315 start_init_common() 316 { 317 proc_t *p = curproc; 318 ASSERT_STACK_ALIGNED(); 319 p->p_zone->zone_proc_initpid = p->p_pid; 320 321 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; 322 p->p_usrstack = (caddr_t)USRSTACK32; 323 p->p_model = DATAMODEL_ILP32; 324 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; 325 p->p_datprot = PROT_ZFOD & ~PROT_EXEC; 326 p->p_stk_ctl = INT32_MAX; 327 328 p->p_as = as_alloc(); 329 p->p_as->a_proc = p; 330 p->p_as->a_userlimit = (caddr_t)USERLIMIT32; 331 (void) hat_setup(p->p_as->a_hat, HAT_INIT); 332 333 init_core(); 334 335 init_mstate(curthread, LMS_SYSTEM); 336 return (exec_init(p->p_zone->zone_initname, p->p_zone->zone_bootargs)); 337 } 338 339 /* 340 * Start the initial user process for the global zone; once running, if 341 * init should subsequently fail, it will be automatically be caught in the 342 * exit(2) path, and restarted by restart_init(). 343 */ 344 static void 345 start_init(void) 346 { 347 proc_init = curproc; 348 349 ASSERT(curproc->p_zone->zone_initname != NULL); 350 351 if (start_init_common() != 0) 352 halt("unix: Could not start init"); 353 lwp_rtt(); 354 } 355 356 void 357 main(void) 358 { 359 proc_t *p = ttoproc(curthread); /* &p0 */ 360 int (**initptr)(); 361 extern void sched(); 362 extern void fsflush(); 363 extern void intrd(); 364 extern int (*init_tbl[])(); 365 extern int (*mp_init_tbl[])(); 366 extern id_t syscid, defaultcid; 367 extern int swaploaded; 368 extern int netboot; 369 extern ib_boot_prop_t *iscsiboot_prop; 370 extern void vm_init(void); 371 extern void cbe_init_pre(void); 372 extern void cbe_init(void); 373 extern void clock_tick_init_pre(void); 374 extern void clock_tick_init_post(void); 375 extern void clock_init(void); 376 extern void physio_bufs_init(void); 377 extern void pm_cfb_setup_intr(void); 378 extern int pm_adjust_timestamps(dev_info_t *, void *); 379 extern void start_other_cpus(int); 380 extern void sysevent_evc_thrinit(); 381 extern kmutex_t ualock; 382 #if 
defined(__x86) 383 extern void fastboot_post_startup(void); 384 extern void progressbar_start(void); 385 #endif 386 /* 387 * In the horrible world of x86 in-lines, you can't get symbolic 388 * structure offsets a la genassym. This assertion is here so 389 * that the next poor slob who innocently changes the offset of 390 * cpu_thread doesn't waste as much time as I just did finding 391 * out that it's hard-coded in i86/ml/i86.il. Similarly for 392 * curcpup. You're welcome. 393 */ 394 ASSERT(CPU == CPU->cpu_self); 395 ASSERT(curthread == CPU->cpu_thread); 396 ASSERT_STACK_ALIGNED(); 397 398 /* 399 * We take the ualock until we have completed the startup 400 * to prevent kadmin() from disrupting this work. In particular, 401 * we don't want kadmin() to bring the system down while we are 402 * trying to start it up. 403 */ 404 mutex_enter(&ualock); 405 406 /* 407 * Setup root lgroup and leaf lgroup for CPU 0 408 */ 409 lgrp_init(LGRP_INIT_STAGE2); 410 411 /* 412 * Once 'startup()' completes, the thread_reaper() daemon would be 413 * created(in thread_init()). After that, it is safe to create threads 414 * that could exit. These exited threads will get reaped. 415 */ 416 startup(); 417 segkmem_gc(); 418 callb_init(); 419 cbe_init_pre(); /* x86 must initialize gethrtimef before timer_init */ 420 timer_init(); /* timer must be initialized before cyclic starts */ 421 cbe_init(); 422 callout_init(); /* callout table MUST be init'd after cyclics */ 423 clock_tick_init_pre(); 424 clock_init(); 425 426 #if defined(__x86) 427 /* 428 * The progressbar thread uses cv_reltimedwait() and hence needs to be 429 * started after the callout mechanism has been initialized. 430 */ 431 progressbar_start(); 432 #endif 433 /* 434 * On some platforms, clkinitf() changes the timing source that 435 * gethrtime_unscaled() uses to generate timestamps. cbe_init() calls 436 * clkinitf(), so re-initialize the microstate counters after the 437 * timesource has been chosen. 
438 */ 439 init_mstate(&t0, LMS_SYSTEM); 440 init_cpu_mstate(CPU, CMS_SYSTEM); 441 442 /* 443 * May need to probe to determine latencies from CPU 0 after 444 * gethrtime() comes alive in cbe_init() and before enabling interrupts 445 * and copy and release any temporary memory allocated with BOP_ALLOC() 446 * before release_bootstrap() frees boot memory 447 */ 448 lgrp_init(LGRP_INIT_STAGE3); 449 450 /* 451 * Call all system initialization functions. 452 */ 453 for (initptr = &init_tbl[0]; *initptr; initptr++) 454 (**initptr)(); 455 /* 456 * Load iSCSI boot properties 457 */ 458 ld_ib_prop(); 459 /* 460 * initialize vm related stuff. 461 */ 462 vm_init(); 463 464 /* 465 * initialize buffer pool for raw I/O requests 466 */ 467 physio_bufs_init(); 468 469 ttolwp(curthread)->lwp_error = 0; /* XXX kludge for SCSI driver */ 470 471 /* 472 * Drop the interrupt level and allow interrupts. At this point 473 * the DDI guarantees that interrupts are enabled. 474 */ 475 (void) spl0(); 476 interrupts_unleashed = 1; 477 478 /* 479 * Create kmem cache for proc structures 480 */ 481 process_cache = kmem_cache_create("process_cache", sizeof (proc_t), 482 0, NULL, NULL, NULL, NULL, NULL, 0); 483 484 vfs_mountroot(); /* Mount the root file system */ 485 errorq_init(); /* after vfs_mountroot() so DDI root is ready */ 486 cpu_kstat_init(CPU); /* after vfs_mountroot() so TOD is valid */ 487 ddi_walk_devs(ddi_root_node(), pm_adjust_timestamps, NULL); 488 /* after vfs_mountroot() so hrestime is valid */ 489 490 post_startup(); 491 swaploaded = 1; 492 493 /* 494 * Initialize Solaris Audit Subsystem 495 */ 496 audit_init(); 497 498 /* 499 * Plumb the protocol modules and drivers only if we are not 500 * networked booted, in this case we already did it in rootconf(). 
501 */ 502 if (netboot == 0 && iscsiboot_prop == NULL) 503 (void) strplumb(); 504 505 gethrestime(&PTOU(curproc)->u_start); 506 curthread->t_start = PTOU(curproc)->u_start.tv_sec; 507 p->p_mstart = gethrtime(); 508 509 /* 510 * Perform setup functions that can only be done after root 511 * and swap have been set up. 512 */ 513 consconfig(); 514 #ifndef __sparc 515 release_bootstrap(); 516 #endif 517 518 /* 519 * attach drivers with ddi-forceattach prop 520 * It must be done early enough to load hotplug drivers (e.g. 521 * pcmcia nexus) so that devices enumerated via hotplug is 522 * available before I/O subsystem is fully initialized. 523 */ 524 i_ddi_forceattach_drivers(); 525 526 /* 527 * Set the scan rate and other parameters of the paging subsystem. 528 */ 529 setupclock(0); 530 531 /* 532 * Initialize process 0's lwp directory and lwpid hash table. 533 */ 534 p->p_lwpdir = p->p_lwpfree = p0_lwpdir; 535 p->p_lwpdir->ld_next = p->p_lwpdir + 1; 536 p->p_lwpdir_sz = 2; 537 p->p_tidhash = p0_tidhash; 538 p->p_tidhash_sz = 2; 539 p0_lep.le_thread = curthread; 540 p0_lep.le_lwpid = curthread->t_tid; 541 p0_lep.le_start = curthread->t_start; 542 lwp_hash_in(p, &p0_lep, p0_tidhash, 2, 0); 543 544 /* 545 * Initialize extended accounting. 546 */ 547 exacct_init(); 548 549 /* 550 * Initialize threads of sysevent event channels 551 */ 552 sysevent_evc_thrinit(); 553 554 /* 555 * This must be done after post_startup() but before 556 * start_other_cpus() 557 */ 558 lgrp_init(LGRP_INIT_STAGE4); 559 560 /* 561 * Perform MP initialization, if any. 562 */ 563 start_other_cpus(0); 564 565 #ifdef __sparc 566 /* 567 * Release bootstrap here since PROM interfaces are 568 * used to start other CPUs above. 569 */ 570 release_bootstrap(); 571 #endif 572 573 /* 574 * Finish lgrp initialization after all CPUS are brought online. 
575 */ 576 lgrp_init(LGRP_INIT_STAGE5); 577 578 /* 579 * After mp_init(), number of cpus are known (this is 580 * true for the time being, when there are actually 581 * hot pluggable cpus then this scheme would not do). 582 * Any per cpu initialization is done here. 583 */ 584 kmem_mp_init(); 585 vmem_update(NULL); 586 587 clock_tick_init_post(); 588 589 for (initptr = &mp_init_tbl[0]; *initptr; initptr++) 590 (**initptr)(); 591 592 /* 593 * These must be called after start_other_cpus 594 */ 595 pm_cfb_setup_intr(); 596 #if defined(__x86) 597 fastboot_post_startup(); 598 #endif 599 600 /* 601 * Make init process; enter scheduling loop with system process. 602 * 603 * Note that we manually assign the pids for these processes, for 604 * historical reasons. If more pre-assigned pids are needed, 605 * FAMOUS_PIDS will have to be updated. 606 */ 607 608 /* create init process */ 609 if (newproc(start_init, NULL, defaultcid, 59, NULL, 610 FAMOUS_PID_INIT)) 611 panic("main: unable to fork init."); 612 613 /* create pageout daemon */ 614 if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL, 615 FAMOUS_PID_PAGEOUT)) 616 panic("main: unable to fork pageout()"); 617 618 /* create fsflush daemon */ 619 if (newproc(fsflush, NULL, syscid, minclsyspri, NULL, 620 FAMOUS_PID_FSFLUSH)) 621 panic("main: unable to fork fsflush()"); 622 623 /* create interrupt balancer daemon */ 624 if (newproc(intrd, NULL, syscid, minclsyspri, NULL, 0)) 625 panic("main: unable to fork intrd()"); 626 627 /* create cluster process if we're a member of one */ 628 if (cluster_bootflags & CLUSTER_BOOTED) { 629 if (newproc(cluster_wrapper, NULL, syscid, minclsyspri, 630 NULL, 0)) { 631 panic("main: unable to fork cluster()"); 632 } 633 } 634 635 /* 636 * Create system threads (threads are associated with p0) 637 */ 638 639 /* create module uninstall daemon */ 640 /* BugID 1132273. 
If swapping over NFS need a bigger stack */ 641 (void) thread_create(NULL, 0, (void (*)())mod_uninstall_daemon, 642 NULL, 0, &p0, TS_RUN, minclsyspri); 643 644 (void) thread_create(NULL, 0, seg_pasync_thread, 645 NULL, 0, &p0, TS_RUN, minclsyspri); 646 647 pid_setmin(); 648 649 /* system is now ready */ 650 mutex_exit(&ualock); 651 652 bcopy("sched", PTOU(curproc)->u_psargs, 6); 653 bcopy("sched", PTOU(curproc)->u_comm, 5); 654 sched(); 655 /* NOTREACHED */ 656 }