illumos Wdiff usr/src/uts/i86pc/os/startup.c

Print this page

OS-2366 ddi_periodic_add(9F) is entirely rubbish

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/i86pc/os/startup.c
          +++ new/usr/src/uts/i86pc/os/startup.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2012 DEY Storage Systems, Inc.  All rights reserved.
  24   24   * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  25   25   */
  26   26  /*
  27   27   * Copyright (c) 2010, Intel Corporation.
  28   28   * All rights reserved.
  29   29   */
  30   30  
  31   31  #include <sys/types.h>
  32   32  #include <sys/t_lock.h>
  33   33  #include <sys/param.h>
  34   34  #include <sys/sysmacros.h>
  35   35  #include <sys/signal.h>
  36   36  #include <sys/systm.h>
  37   37  #include <sys/user.h>
  38   38  #include <sys/mman.h>
  39   39  #include <sys/vm.h>
  40   40  #include <sys/conf.h>
  41   41  #include <sys/avintr.h>
  42   42  #include <sys/autoconf.h>
  43   43  #include <sys/disp.h>
  44   44  #include <sys/class.h>
  45   45  #include <sys/bitmap.h>
  46   46  
  47   47  #include <sys/privregs.h>
  48   48  
  49   49  #include <sys/proc.h>
  50   50  #include <sys/buf.h>
  51   51  #include <sys/kmem.h>
  52   52  #include <sys/mem.h>
  53   53  #include <sys/kstat.h>
  54   54  
  55   55  #include <sys/reboot.h>
  56   56  
  57   57  #include <sys/cred.h>
  58   58  #include <sys/vnode.h>
  59   59  #include <sys/file.h>
  60   60  
  61   61  #include <sys/procfs.h>
  62   62  
  63   63  #include <sys/vfs.h>
  64   64  #include <sys/cmn_err.h>
  65   65  #include <sys/utsname.h>
  66   66  #include <sys/debug.h>
  67   67  #include <sys/kdi.h>
  68   68  
  69   69  #include <sys/dumphdr.h>
  70   70  #include <sys/bootconf.h>
  71   71  #include <sys/memlist_plat.h>
  72   72  #include <sys/varargs.h>
  73   73  #include <sys/promif.h>
  74   74  #include <sys/modctl.h>
  75   75  
  76   76  #include <sys/sunddi.h>
  77   77  #include <sys/sunndi.h>
  78   78  #include <sys/ndi_impldefs.h>
  79   79  #include <sys/ddidmareq.h>
  80   80  #include <sys/psw.h>
  81   81  #include <sys/regset.h>
  82   82  #include <sys/clock.h>
  83   83  #include <sys/pte.h>
  84   84  #include <sys/tss.h>
  85   85  #include <sys/stack.h>
  86   86  #include <sys/trap.h>
  87   87  #include <sys/fp.h>
  88   88  #include <vm/kboot_mmu.h>
  89   89  #include <vm/anon.h>
  90   90  #include <vm/as.h>
  91   91  #include <vm/page.h>
  92   92  #include <vm/seg.h>
  93   93  #include <vm/seg_dev.h>
  94   94  #include <vm/seg_kmem.h>
  95   95  #include <vm/seg_kpm.h>
  96   96  #include <vm/seg_map.h>
  97   97  #include <vm/seg_vn.h>
  98   98  #include <vm/seg_kp.h>
  99   99  #include <sys/memnode.h>
 100  100  #include <vm/vm_dep.h>
 101  101  #include <sys/thread.h>
 102  102  #include <sys/sysconf.h>
 103  103  #include <sys/vm_machparam.h>
 104  104  #include <sys/archsystm.h>
 105  105  #include <sys/machsystm.h>
 106  106  #include <vm/hat.h>
 107  107  #include <vm/hat_i86.h>
 108  108  #include <sys/pmem.h>
 109  109  #include <sys/smp_impldefs.h>
 110  110  #include <sys/x86_archext.h>
 111  111  #include <sys/cpuvar.h>
 112  112  #include <sys/segments.h>
 113  113  #include <sys/clconf.h>
 114  114  #include <sys/kobj.h>
 115  115  #include <sys/kobj_lex.h>
 116  116  #include <sys/cpc_impl.h>
 117  117  #include <sys/cpu_module.h>
 118  118  #include <sys/smbios.h>
 119  119  #include <sys/debug_info.h>
 120  120  #include <sys/bootinfo.h>
 121  121  #include <sys/ddi_timer.h>
 122  122  #include <sys/systeminfo.h>
 123  123  #include <sys/multiboot.h>
 124  124  
 125  125  #ifdef  __xpv
 126  126  
 127  127  #include <sys/hypervisor.h>
 128  128  #include <sys/xen_mmu.h>
 129  129  #include <sys/evtchn_impl.h>
 130  130  #include <sys/gnttab.h>
 131  131  #include <sys/xpv_panic.h>
 132  132  #include <xen/sys/xenbus_comms.h>
 133  133  #include <xen/public/physdev.h>
 134  134  
 135  135  extern void xen_late_startup(void);
 136  136  
 137  137  struct xen_evt_data cpu0_evt_data;
 138  138  
 139  139  #else   /* __xpv */
 140  140  #include <sys/memlist_impl.h>
 141  141  
 142  142  extern void mem_config_init(void);
 143  143  #endif /* __xpv */
 144  144  
 145  145  extern void progressbar_init(void);
 146  146  extern void brand_init(void);
 147  147  extern void pcf_init(void);
 148  148  extern void pg_init(void);
 149  149  
 150  150  extern int size_pse_array(pgcnt_t, int);
 151  151  
 152  152  #if defined(_SOFT_HOSTID)
 153  153  
 154  154  #include <sys/rtc.h>
 155  155  
 156  156  static int32_t set_soft_hostid(void);
 157  157  static char hostid_file[] = "/etc/hostid";
 158  158  
 159  159  #endif
 160  160  
 161  161  void *gfx_devinfo_list;
 162  162  
 163  163  #if defined(__amd64) && !defined(__xpv)
 164  164  extern void immu_startup(void);
 165  165  #endif
 166  166  
 167  167  /*
 168  168   * XXX make declaration below "static" when drivers no longer use this
 169  169   * interface.
 170  170   */
 171  171  extern caddr_t p0_va;   /* Virtual address for accessing physical page 0 */
 172  172  
 173  173  /*
 174  174   * segkp
 175  175   */
 176  176  extern int segkp_fromheap;
 177  177  
 178  178  static void kvm_init(void);
 179  179  static void startup_init(void);
 180  180  static void startup_memlist(void);
 181  181  static void startup_kmem(void);
 182  182  static void startup_modules(void);
 183  183  static void startup_vm(void);
 184  184  static void startup_end(void);
 185  185  static void layout_kernel_va(void);
 186  186  
 187  187  /*
 188  188   * Declare these as initialized data so we can patch them.
 189  189   */
 190  190  #ifdef __i386
 191  191  
 192  192  /*
 193  193   * Due to virtual address space limitations running in 32 bit mode, restrict
 194  194   * the amount of physical memory configured to a max of PHYSMEM pages (16g).
 195  195   *
 196  196   * If the physical max memory size of 64g were allowed to be configured, the
 197  197   * size of user virtual address space will be less than 1g. A limited user
 198  198   * address space greatly reduces the range of applications that can run.
 199  199   *
 200  200   * If more physical memory than PHYSMEM is required, users should preferably
 201  201   * run in 64 bit mode which has far looser virtual address space limitations.
 202  202   *
 203  203   * If 64 bit mode is not available (as in IA32) and/or more physical memory
 204  204   * than PHYSMEM is required in 32 bit mode, physmem can be set to the desired
 205  205   * value or to 0 (to configure all available memory) via eeprom(1M). kernelbase
 206  206   * should also be carefully tuned to balance out the need of the user
 207  207   * application while minimizing the risk of kernel heap exhaustion due to
 208  208   * kernelbase being set too high.
 209  209   */
 210  210  #define PHYSMEM 0x400000
 211  211  
 212  212  #else /* __amd64 */
 213  213  
 214  214  /*
 215  215   * For now we can handle memory with physical addresses up to about
 216  216   * 64 Terabytes. This keeps the kernel above the VA hole, leaving roughly
 217  217   * half the VA space for seg_kpm. When systems get bigger than 64TB this
 218  218   * code will need revisiting. There is an implicit assumption that there
 219  219   * are no *huge* holes in the physical address space too.
 220  220   */
 221  221  #define TERABYTE                (1ul << 40)
 222  222  #define PHYSMEM_MAX64           mmu_btop(64 * TERABYTE)
 223  223  #define PHYSMEM                 PHYSMEM_MAX64
 224  224  #define AMD64_VA_HOLE_END       0xFFFF800000000000ul
 225  225  
 226  226  #endif /* __amd64 */
 227  227  
 228  228  pgcnt_t physmem = PHYSMEM;
 229  229  pgcnt_t obp_pages;      /* Memory used by PROM for its text and data */
 230  230  
 231  231  char *kobj_file_buf;
 232  232  int kobj_file_bufsize;  /* set in /etc/system */
 233  233  
 234  234  /* Global variables for MP support. Used in mp_startup */
 235  235  caddr_t rm_platter_va = 0;
 236  236  uint32_t rm_platter_pa;
 237  237  
 238  238  int     auto_lpg_disable = 1;
 239  239  
 240  240  /*
 241  241   * Some CPUs have holes in the middle of the 64-bit virtual address range.
 242  242   */
 243  243  uintptr_t hole_start, hole_end;
 244  244  
 245  245  /*
 246  246   * kpm mapping window
 247  247   */
 248  248  caddr_t kpm_vbase;
 249  249  size_t  kpm_size;
 250  250  static int kpm_desired;
 251  251  #ifdef __amd64
 252  252  static uintptr_t segkpm_base = (uintptr_t)SEGKPM_BASE;
 253  253  #endif
 254  254  
 255  255  /*
 256  256   * Configuration parameters set at boot time.
 257  257   */
 258  258  
 259  259  caddr_t econtig;                /* end of first block of contiguous kernel */
 260  260  
 261  261  struct bootops          *bootops = 0;   /* passed in from boot */
 262  262  struct bootops          **bootopsp;
 263  263  struct boot_syscalls    *sysp;          /* passed in from boot */
 264  264  
 265  265  char bootblock_fstype[16];
 266  266  
 267  267  char kern_bootargs[OBP_MAXPATHLEN];
 268  268  char kern_bootfile[OBP_MAXPATHLEN];
 269  269  
 270  270  /*
 271  271   * ZFS zio segment.  This allows us to exclude large portions of ZFS data that
 272  272   * gets cached in kmem caches on the heap.  If this is set to zero, we allocate
 273  273   * zio buffers from their own segment, otherwise they are allocated from the
 274  274   * heap.  The optimization of allocating zio buffers from their own segment is
 275  275   * only valid on 64-bit kernels.
 276  276   */
 277  277  #if defined(__amd64)
 278  278  int segzio_fromheap = 0;
 279  279  #else
 280  280  int segzio_fromheap = 1;
 281  281  #endif
 282  282  
 283  283  /*
 284  284   * New memory fragments are possible in startup() due to BOP_ALLOCs.  The
 285  285   * count depends on the number of BOP_ALLOC calls made, the combination of
 286  286   * requested and memory sizes, and whether boot.bin memory needs freeing.
 287  287   */
 288  288  #define POSS_NEW_FRAGMENTS      12
 289  289  
 290  290  /*
 291  291   * VM data structures
 292  292   */
 293  293  long page_hashsz;               /* Size of page hash table (power of two) */
 294  294  unsigned int page_hashsz_shift; /* log2(page_hashsz) */
 295  295  struct page *pp_base;           /* Base of initial system page struct array */
 296  296  struct page **page_hash;        /* Page hash table */
 297  297  pad_mutex_t *pse_mutex;         /* Locks protecting pp->p_selock */
 298  298  size_t pse_table_size;          /* Number of mutexes in pse_mutex[] */
 299  299  int pse_shift;                  /* log2(pse_table_size) */
 300  300  struct seg ktextseg;            /* Segment used for kernel executable image */
 301  301  struct seg kvalloc;             /* Segment used for "valloc" mapping */
 302  302  struct seg kpseg;               /* Segment used for pageable kernel virt mem */
 303  303  struct seg kmapseg;             /* Segment used for generic kernel mappings */
 304  304  struct seg kdebugseg;           /* Segment used for the kernel debugger */
 305  305  
 306  306  struct seg *segkmap = &kmapseg; /* Kernel generic mapping segment */
 307  307  static struct seg *segmap = &kmapseg;   /* easier to use name for in here */
 308  308  
 309  309  struct seg *segkp = &kpseg;     /* Pageable kernel virtual memory segment */
 310  310  
 311  311  #if defined(__amd64)
 312  312  struct seg kvseg_core;          /* Segment used for the core heap */
 313  313  struct seg kpmseg;              /* Segment used for physical mapping */
 314  314  struct seg *segkpm = &kpmseg;   /* 64bit kernel physical mapping segment */
 315  315  #else
 316  316  struct seg *segkpm = NULL;      /* Unused on IA32 */
 317  317  #endif
 318  318  
 319  319  caddr_t segkp_base;             /* Base address of segkp */
 320  320  caddr_t segzio_base;            /* Base address of segzio */
 321  321  #if defined(__amd64)
 322  322  pgcnt_t segkpsize = btop(SEGKPDEFSIZE); /* size of segkp segment in pages */
 323  323  #else
 324  324  pgcnt_t segkpsize = 0;
 325  325  #endif
 326  326  pgcnt_t segziosize = 0;         /* size of zio segment in pages */
 327  327  
 328  328  /*
 329  329   * A static DR page_t VA map is reserved that can map the page structures
 330  330   * for a domain's entire RA space. The pages that back this space are
 331  331   * dynamically allocated and need not be physically contiguous.  The DR
 332  332   * map size is derived from KPM size.
 333  333   * This mechanism isn't used by x86 yet, so just stubs here.
 334  334   */
 335  335  int ppvm_enable = 0;            /* Static virtual map for page structs */
 336  336  page_t *ppvm_base = NULL;       /* Base of page struct map */
 337  337  pgcnt_t ppvm_size = 0;          /* Size of page struct map */
 338  338  
 339  339  /*
 340  340   * VA range available to the debugger
 341  341   */
 342  342  const caddr_t kdi_segdebugbase = (const caddr_t)SEGDEBUGBASE;
 343  343  const size_t kdi_segdebugsize = SEGDEBUGSIZE;
 344  344  
 345  345  struct memseg *memseg_base;
 346  346  struct vnode unused_pages_vp;
 347  347  
 348  348  #define FOURGB  0x100000000LL
 349  349  
 350  350  struct memlist *memlist;
 351  351  
 352  352  caddr_t s_text;         /* start of kernel text segment */
 353  353  caddr_t e_text;         /* end of kernel text segment */
 354  354  caddr_t s_data;         /* start of kernel data segment */
 355  355  caddr_t e_data;         /* end of kernel data segment */
 356  356  caddr_t modtext;        /* start of loadable module text reserved */
 357  357  caddr_t e_modtext;      /* end of loadable module text reserved */
 358  358  caddr_t moddata;        /* start of loadable module data reserved */
 359  359  caddr_t e_moddata;      /* end of loadable module data reserved */
 360  360  
 361  361  struct memlist *phys_install;   /* Total installed physical memory */
 362  362  struct memlist *phys_avail;     /* Total available physical memory */
 363  363  struct memlist *bios_rsvd;      /* Bios reserved memory */
 364  364  
 365  365  /*
 366  366   * kphysm_init returns the number of pages that were processed
 367  367   */
 368  368  static pgcnt_t kphysm_init(page_t *, pgcnt_t);
 369  369  
 370  370  #define IO_PROP_SIZE    64      /* device property size */
 371  371  
 372  372  /*
 373  373   * a couple useful roundup macros
            *
            * Each macro casts x to uintptr_t and rounds it up with P2ROUNDUP() to a
            * multiple of the size listed below; the result is a uintptr_t.
            *
            *      ROUND_UP_PAGE           MMU_PAGESIZE
            *      ROUND_UP_LPAGE          mmu.level_size[1]
            *      ROUND_UP_4MEG           FOUR_MEG
            *      ROUND_UP_TOPLEVEL       mmu.level_size[mmu.max_level]
            *
            * ROUND_UP_LPAGE and ROUND_UP_TOPLEVEL read the global "mmu" at the
            * point of use, so they are only meaningful once it has been filled in.
 374  374   */
 375  375  #define ROUND_UP_PAGE(x)        \
 376  376          ((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)MMU_PAGESIZE))
 377  377  #define ROUND_UP_LPAGE(x)       \
 378  378          ((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[1]))
 379  379  #define ROUND_UP_4MEG(x)        \
 380  380          ((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)FOUR_MEG))
 381  381  #define ROUND_UP_TOPLEVEL(x)    \
 382  382          ((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[mmu.max_level]))
 383  383  
 384  384  /*
 385  385   *      32-bit Kernel's Virtual memory layout.
 386  386   *              +-----------------------+
 387  387   *              |                       |
 388  388   * 0xFFC00000  -|-----------------------|- ARGSBASE
 389  389   *              |       debugger        |
 390  390   * 0xFF800000  -|-----------------------|- SEGDEBUGBASE
 391  391   *              |      Kernel Data      |
 392  392   * 0xFEC00000  -|-----------------------|
 393  393   *              |      Kernel Text      |
 394  394   * 0xFE800000  -|-----------------------|- KERNEL_TEXT (0xFB400000 on Xen)
 395  395   *              |---       GDT       ---|- GDT page (GDT_VA)
 396  396   *              |---    debug info   ---|- debug info (DEBUG_INFO_VA)
 397  397   *              |                       |
 398  398   *              |   page_t structures   |
 399  399   *              |   memsegs, memlists,  |
 400  400   *              |   page hash, etc.     |
 401  401   * ---         -|-----------------------|- ekernelheap, valloc_base (floating)
 402  402   *              |                       |  (segkp is just an arena in the heap)
 403  403   *              |                       |
 404  404   *              |       kvseg           |
 405  405   *              |                       |
 406  406   *              |                       |
 407  407   * ---         -|-----------------------|- kernelheap (floating)
 408  408   *              |        Segkmap        |
 409  409   * 0xC3002000  -|-----------------------|- segmap_start (floating)
 410  410   *              |       Red Zone        |
 411  411   * 0xC3000000  -|-----------------------|- kernelbase / userlimit (floating)
 412  412   *              |                       |                       ||
 413  413   *              |     Shared objects    |                       \/
 414  414   *              |                       |
 415  415   *              :                       :
 416  416   *              |       user data       |
 417  417   *              |-----------------------|
 418  418   *              |       user text       |
 419  419   * 0x08048000  -|-----------------------|
 420  420   *              |       user stack      |
 421  421   *              :                       :
 422  422   *              |       invalid         |
 423  423   * 0x00000000   +-----------------------+
 424  424   *
 425  425   *
 426  426   *              64-bit Kernel's Virtual memory layout. (assuming 64 bit app)
 427  427   *                      +-----------------------+
 428  428   *                      |                       |
 429  429   * 0xFFFFFFFF.FFC00000  |-----------------------|- ARGSBASE
 430  430   *                      |       debugger (?)    |
 431  431   * 0xFFFFFFFF.FF800000  |-----------------------|- SEGDEBUGBASE
 432  432   *                      |      unused           |
 433  433   *                      +-----------------------+
 434  434   *                      |      Kernel Data      |
 435  435   * 0xFFFFFFFF.FBC00000  |-----------------------|
 436  436   *                      |      Kernel Text      |
 437  437   * 0xFFFFFFFF.FB800000  |-----------------------|- KERNEL_TEXT
 438  438   *                      |---       GDT       ---|- GDT page (GDT_VA)
 439  439   *                      |---    debug info   ---|- debug info (DEBUG_INFO_VA)
 440  440   *                      |                       |
 441  441   *                      |      Core heap        | (used for loadable modules)
 442  442   * 0xFFFFFFFF.C0000000  |-----------------------|- core_base / ekernelheap
 443  443   *                      |        Kernel         |
 444  444   *                      |         heap          |
 445  445   * 0xFFFFFXXX.XXX00000  |-----------------------|- kernelheap (floating)
 446  446   *                      |        segmap         |
 447  447   * 0xFFFFFXXX.XXX00000  |-----------------------|- segmap_start (floating)
 448  448   *                      |    device mappings    |
 449  449   * 0xFFFFFXXX.XXX00000  |-----------------------|- toxic_addr (floating)
 450  450   *                      |         segzio        |
 451  451   * 0xFFFFFXXX.XXX00000  |-----------------------|- segzio_base (floating)
 452  452   *                      |         segkp         |
 453  453   * ---                  |-----------------------|- segkp_base (floating)
 454  454   *                      |   page_t structures   |  valloc_base + valloc_sz
 455  455   *                      |   memsegs, memlists,  |
 456  456   *                      |   page hash, etc.     |
 457  457   * 0xFFFFFF00.00000000  |-----------------------|- valloc_base (lower if > 1TB)
 458  458   *                      |        segkpm         |
 459  459   * 0xFFFFFE00.00000000  |-----------------------|
 460  460   *                      |       Red Zone        |
 461  461   * 0xFFFFFD80.00000000  |-----------------------|- KERNELBASE (lower if > 1TB)
 462  462   *                      |     User stack        |- User space memory
 463  463   *                      |                       |
 464  464   *                      | shared objects, etc   |       (grows downwards)
 465  465   *                      :                       :
 466  466   *                      |                       |
 467  467   * 0xFFFF8000.00000000  |-----------------------|
 468  468   *                      |                       |
 469  469   *                      | VA Hole / unused      |
 470  470   *                      |                       |
 471  471   * 0x00008000.00000000  |-----------------------|
 472  472   *                      |                       |
 473  473   *                      |                       |
 474  474   *                      :                       :
 475  475   *                      |       user heap       |       (grows upwards)
 476  476   *                      |                       |
 477  477   *                      |       user data       |
 478  478   *                      |-----------------------|
 479  479   *                      |       user text       |
 480  480   * 0x00000000.04000000  |-----------------------|
 481  481   *                      |       invalid         |
 482  482   * 0x00000000.00000000  +-----------------------+
 483  483   *
 484  484   * A 32 bit app on the 64 bit kernel sees the same layout as on the 32 bit
 485  485   * kernel, except that userlimit is raised to 0xfe000000
 486  486   *
 487  487   * Floating values:
 488  488   *
 489  489   * valloc_base: start of the kernel's memory management/tracking data
 490  490   * structures.  This region contains page_t structures for
 491  491   * physical memory, memsegs, memlists, and the page hash.
 492  492   *
 493  493   * core_base: start of the kernel's "core" heap area on 64-bit systems.
 494  494   * This area is intended to be used for global data as well as for module
 495  495   * text/data that does not fit into the nucleus pages.  The core heap is
 496  496   * restricted to a 2GB range, allowing every address within it to be
 497  497   * accessed using rip-relative addressing
 498  498   *
 499  499   * ekernelheap: end of kernelheap (valloc_base on 32-bit, core_base on 64-bit).
 500  500   *
 501  501   * kernelheap: start of kernel heap.  On 32-bit systems, this starts right
 502  502   * above a red zone that separates the user's address space from the
 503  503   * kernel's.  On 64-bit systems, it sits above segkp and segkpm.
 504  504   *
 505  505   * segmap_start: start of segmap. The length of segmap can be modified
 506  506   * through eeprom. The default length is 16MB on 32-bit systems and 64MB
 507  507   * on 64-bit systems.
 508  508   *
 509  509   * kernelbase: On a 32-bit kernel the default value of 0xd4000000 will be
 510  510   * decreased by 2X the size required for page_t.  This allows the kernel
 511  511   * heap to grow in size with physical memory.  With sizeof(page_t) == 80
 512  512   * bytes, the following shows the values of kernelbase and kernel heap
 513  513   * sizes for different memory configurations (assuming default segmap and
 514  514   * segkp sizes).
 515  515   *
 516  516   *      mem     size for        kernelbase      kernel heap
 517  517   *      size    page_t's                        size
 518  518   *      ----    ---------       ----------      -----------
 519  519   *      1gb     0x01400000      0xd1800000      684MB
 520  520   *      2gb     0x02800000      0xcf000000      704MB
 521  521   *      4gb     0x05000000      0xca000000      744MB
 522  522   *      6gb     0x07800000      0xc5000000      784MB
 523  523   *      8gb     0x0a000000      0xc0000000      824MB
 524  524   *      16gb    0x14000000      0xac000000      984MB
 525  525   *      32gb    0x28000000      0x84000000      1304MB
 526  526   *      64gb    0x50000000      0x34000000      1944MB (*)
 527  527   *
 528  528   * kernelbase is less than the abi minimum of 0xc0000000 for memory
 529  529   * configurations above 8gb.
 530  530   *
 531  531   * (*) support for memory configurations above 32gb will require manual tuning
 532  532   * of kernelbase to balance out the need of user applications.
 533  533   */
 534  534  
 535  535  /* real-time-clock initialization parameters */
 536  536  extern time_t process_rtc_config_file(void);
 537  537  
 538  538  uintptr_t       kernelbase;
 539  539  uintptr_t       postbootkernelbase;     /* not set till boot loader is gone */
 540  540  uintptr_t       eprom_kernelbase;
 541  541  size_t          segmapsize;
 542  542  uintptr_t       segmap_start;
 543  543  int             segmapfreelists;
 544  544  pgcnt_t         npages;
 545  545  pgcnt_t         orig_npages;
 546  546  size_t          core_size;              /* size of "core" heap */
 547  547  uintptr_t       core_base;              /* base address of "core" heap */
 548  548  
 549  549  /*
 550  550   * List of bootstrap pages. We mark these as allocated in startup.
 551  551   * release_bootstrap() will free them when we're completely done with
 552  552   * the bootstrap.
 553  553   */
 554  554  static page_t *bootpages;
 555  555  
 556  556  /*
 557  557   * boot time pages that have a vnode from the ramdisk will keep that forever.
 558  558   */
 559  559  static page_t *rd_pages;
 560  560  
 561  561  /*
 562  562   * Lower 64K
 563  563   */
 564  564  static page_t *lower_pages = NULL;
 565  565  static int lower_pages_count = 0;
 566  566  
 567  567  struct system_hardware system_hardware;
 568  568  
 569  569  /*
 570  570   * Enable some debugging messages concerning memory usage...
            *
            * print_memlist() prints a "MEMLIST: <title>:" heading through
            * prom_printf() and then one line per entry of the list headed by mp,
            * showing that entry's ml_address and ml_size in hex.  The list is
            * walked through ml_next until NULL is reached, so a NULL mp prints
            * only the heading.
 571  571   */
 572  572  static void
 573  573  print_memlist(char *title, struct memlist *mp)
 574  574  {
 575  575          prom_printf("MEMLIST: %s:\n", title);
 576  576          while (mp != NULL)  {
 577  577                  prom_printf("\tAddress 0x%" PRIx64 ", size 0x%" PRIx64 "\n",
 578  578                      mp->ml_address, mp->ml_size);
 579  579                  mp = mp->ml_next;
 580  580          }
 581  581  }
 582  582  
 583  583  /*
 584  584   * XX64 need a comment here.. are these just default values, surely
 585  585   * we read the "cpuid" type information to figure this out.
 586  586   */
 587  587  int     l2cache_sz = 0x80000;
 588  588  int     l2cache_linesz = 0x40;
 589  589  int     l2cache_assoc = 1;
 590  590  
 591  591  static size_t   textrepl_min_gb = 10;
 592  592  
 593  593  /*
 594  594   * On 64 bit we use a predefined VA range for mapping devices in the kernel;
 595  595   * on 32 bit the mappings are intermixed in the heap, so we use a bit map.
 596  596   */
 597  597  #ifdef __amd64
 598  598  
 599  599  vmem_t          *device_arena;
 600  600  uintptr_t       toxic_addr = (uintptr_t)NULL;
 601  601  size_t          toxic_size = 1024 * 1024 * 1024; /* Sparc uses 1 gig too */
 602  602  
 603  603  #else   /* __i386 */
 604  604  
 605  605  ulong_t         *toxic_bit_map; /* one bit for each 4k of VA in heap_arena */
 606  606  size_t          toxic_bit_map_len = 0;  /* in bits */
 607  607  
 608  608  #endif  /* __i386 */
 609  609  
 610  610  /*
 611  611   * Simple boot time debug facilities
            *
            * prm_dbg_str[] holds the two PRM_DEBUG() formats: entry 0 prints the
            * value with %x, entry 1 prints it with %llx.
 612  612   */
 613  613  static char *prm_dbg_str[] = {
 614  614          "%s:%d: '%s' is 0x%x\n",
 615  615          "%s:%d: '%s' is 0x%llx\n"
 616  616  };
 617  617  
           /* Nonzero enables the PRM_DEBUG() and PRM_POINT() output below. */
 618  618  int prom_debug;
 619  619  
           /*
            * PRM_DEBUG(q) prints the source text of the expression q together with
            * its value.  The format is chosen by sizeof (q) >> 3: index 0 for
            * objects smaller than 8 bytes, index 1 for 8-byte objects.
            * PRM_POINT(q) prints the string q.  Both prefix the output with
            * "startup.c" and the current line number, and both do nothing unless
            * prom_debug is nonzero.
            *
            * NOTE(review): each macro expands to an unbraced "if" statement that
            * already ends in a semicolon, so using one as the unbraced body of an
            * if/else would mis-bind the else.  An argument whose size is 16 bytes
            * or more would index past the end of prm_dbg_str[].
            */
 620  620  #define PRM_DEBUG(q)    if (prom_debug)         \
 621  621          prom_printf(prm_dbg_str[sizeof (q) >> 3], "startup.c", __LINE__, #q, q);
 622  622  #define PRM_POINT(q)    if (prom_debug)         \
 623  623          prom_printf("%s:%d: %s\n", "startup.c", __LINE__, q);
 624  624  
 625  625  /*
 626  626   * This structure is used to keep track of the initial allocations
 627  627   * done in startup_memlist(). The value of NUM_ALLOCATIONS needs to
 628  628   * be >= the number of ADD_TO_ALLOCATIONS() executed in the code.
            *
            * Each allocations[] entry records where the eventual address must be
            * stored (al_ptr) and how many bytes that request needs (al_size).
            * num_allocations counts the entries in use and valloc_sz is the running
            * total of their sizes; valloc_base is the address at which
            * perform_allocations() asks for the combined region.
 629  629   */
 630  630  #define NUM_ALLOCATIONS 8
 631  631  int num_allocations = 0;
 632  632  struct {
 633  633          void **al_ptr;
 634  634          size_t al_size;
 635  635  } allocations[NUM_ALLOCATIONS];
 636  636  size_t valloc_sz = 0;
 637  637  uintptr_t valloc_base;
 638  638  
           /*
            * ADD_TO_ALLOCATIONS(ptr, size) registers one request:
            *  - size is rounded up to a page multiple with ROUND_UP_PAGE() and the
            *    rounded value is written back to the caller's size argument, which
            *    must therefore be a modifiable lvalue;
            *  - the address of ptr and the rounded size go into the next free
            *    allocations[] slot, and the size is added to valloc_sz;
            *  - the macro panics if all NUM_ALLOCATIONS slots are already in use.
            * ptr itself is not assigned here; perform_allocations() stores through
            * the recorded al_ptr later.
            */
 639  639  #define ADD_TO_ALLOCATIONS(ptr, size) {                                 \
 640  640                  size = ROUND_UP_PAGE(size);                             \
 641  641                  if (num_allocations == NUM_ALLOCATIONS)                 \
 642  642                          panic("too many ADD_TO_ALLOCATIONS()");         \
 643  643                  allocations[num_allocations].al_ptr = (void**)&ptr;     \
 644  644                  allocations[num_allocations].al_size = size;            \
 645  645                  valloc_sz += size;                                      \
 646  646                  ++num_allocations;                                      \
 647  647          }
 648  648  
 649  649  /*
 650  650   * Allocate all the initial memory needed by the page allocator.
            *
            * One BOP_ALLOC() of valloc_sz bytes is requested at valloc_base and
            * zeroed.  The region is then handed out in registration order: each
            * pointer recorded by ADD_TO_ALLOCATIONS() is set to the current
            * position, which then advances by that entry's al_size.
            *
            * Panics if BOP_ALLOC() returns anything other than valloc_base.
 651  651   */
 652  652  static void
 653  653  perform_allocations(void)
 654  654  {
 655  655          caddr_t mem;
 656  656          int i;
 657  657          int valloc_align;
 658  658  
 659  659          PRM_DEBUG(valloc_base);
 660  660          PRM_DEBUG(valloc_sz);
                   /* level_size[1] when mmu.max_page_level > 0, else level_size[0] */
 661  661          valloc_align = mmu.level_size[mmu.max_page_level > 0];
 662  662          mem = BOP_ALLOC(bootops, (caddr_t)valloc_base, valloc_sz, valloc_align);
 663  663          if (mem != (caddr_t)valloc_base)
 664  664                  panic("BOP_ALLOC() failed");
 665  665          bzero(mem, valloc_sz);
                   /* carve the zeroed region into the registered pieces, in order */
 666  666          for (i = 0; i < num_allocations; ++i) {
 667  667                  *allocations[i].al_ptr = (void *)mem;
 668  668                  mem += allocations[i].al_size;
 669  669          }
 670  670  }
 671  671  
 672  672  /*
 673  673   * Our world looks like this at startup time.
 674  674   *
 675  675   * In a 32-bit OS, boot loads the kernel text at 0xfe800000 and kernel data
 676  676   * at 0xfec00000.  On a 64-bit OS, kernel text and data are loaded at
 677  677   * 0xffffffff.fb800000 and 0xffffffff.fbc00000 respectively.  Those
 678  678   * addresses are fixed in the binary at link time.
 679  679   *
 680  680   * On the text page:
 681  681   * unix/genunix/krtld/module text loads.
 682  682   *
 683  683   * On the data page:
 684  684   * unix/genunix/krtld/module data loads.
 685  685   *
 686  686   * Machine-dependent startup code
 687  687   */
 688  688  void
 689  689  startup(void)
 690  690  {
 691  691  #if !defined(__xpv)
 692  692          extern void startup_pci_bios(void);
 693  693  #endif
 694  694          extern cpuset_t cpu_ready_set;
 695  695  
 696  696          /*
 697  697           * Make sure that nobody tries to use segkpm until we have
 698  698           * initialized it properly.
 699  699           */
 700  700  #if defined(__amd64)
 701  701          kpm_desired = 1;
 702  702  #endif
 703  703          kpm_enable = 0;
 704  704          CPUSET_ONLY(cpu_ready_set, 0);  /* cpu 0 is boot cpu */
 705  705  
 706  706  #if defined(__xpv)      /* XXPV fix me! */
 707  707          {
 708  708                  extern int segvn_use_regions;
 709  709                  segvn_use_regions = 0;
 710  710          }
 711  711  #endif
                   /*
                    * The boot phases below run in the order listed.  The __xpv
                    * build adds startup_xen_version() and startup_xen_mca(); the
                    * non-__xpv build adds startup_pci_bios().
                    */
 712  712          progressbar_init();
 713  713          startup_init();
 714  714  #if defined(__xpv)
 715  715          startup_xen_version();
 716  716  #endif
 717  717          startup_memlist();
 718  718          startup_kmem();
 719  719          startup_vm();
 720  720  #if !defined(__xpv)
 721  721          /*
 722  722           * Note we need to do this even on fast reboot in order to access
 723  723           * the irq routing table (used for pci labels).
 724  724           */
 725  725          startup_pci_bios();
 726  726  #endif
 727  727  #if defined(__xpv)
 728  728          startup_xen_mca();
 729  729  #endif
 730  730          startup_modules();
 731  731  
 732  732          startup_end();
 733  733  }
 734  734  
           /*
            * First phase of startup(): finish cpuid extraction, check the boot
            * interface version, enable prom_debug output if the "prom_debug"
            * property is present in the boot environment, gather the system
            * configuration, and halt on 486-class processors.
            */
 735  735  static void
 736  736  startup_init(void)
 737  737  {
 738  738          PRM_POINT("startup_init() starting...");
 739  739  
 740  740          /*
 741  741           * Complete the extraction of cpuid data
 742  742           */
 743  743          cpuid_pass2(CPU);
 744  744  
 745  745          (void) check_boot_version(BOP_GETVERSION(bootops));
 746  746  
 747  747          /*
 748  748           * Check for prom_debug in boot environment
 749  749           */
 750  750          if (BOP_GETPROPLEN(bootops, "prom_debug") >= 0) {
 751  751                  ++prom_debug;
 752  752                  PRM_POINT("prom_debug found in boot environment");
 753  753          }
 754  754  
 755  755          /*
 756  756           * Collect node, cpu and memory configuration information.
 757  757           */
 758  758          get_system_configuration();
 759  759  
 760  760          /*
 761  761           * Halt if this is an unsupported processor.
 762  762           */
 763  763          if (x86_type == X86_TYPE_486 || x86_type == X86_TYPE_CYRIX_486) {
 764  764                  printf("\n486 processor (\"%s\") detected.\n",
 765  765                      CPU->cpu_brandstr);
 766  766                  halt("This processor is not supported by this release "
 767  767                      "of Solaris.");
 768  768          }
 769  769  
 770  770          PRM_POINT("startup_init() done");
 771  771  }
 772  772  
 773  773  /*
 774  774   * Callback for copy_memlist_filter() to filter nucleus, kadb/kmdb, (ie.
 775  775   * everything mapped above KERNEL_TEXT) pages from phys_avail. Note it
 776  776   * also filters out physical page zero.  There is some reliance on the
 777  777   * boot loader allocating only a few contiguous physical memory chunks.
            *
            * addr and size are in/out: on entry they describe one physical range,
            * on return they describe what is left of it after trimming.  The range
            * is only ever shrunk from its front or its end; it is never split.
            *
            * NOTE(review): kbm_probe() is presumably "find the next mapping at or
            * above va, returning its va, length, pfn and protections" -- confirm
            * against its definition.
 778  778   */
 779  779  static void
 780  780  avail_filter(uint64_t *addr, uint64_t *size)
 781  781  {
 782  782          uintptr_t va;
 783  783          uintptr_t next_va;
 784  784          pfn_t pfn;
 785  785          uint64_t pfn_addr;
 786  786          uint64_t pfn_eaddr;
 787  787          uint_t prot;
 788  788          size_t len;
 789  789          uint_t change;
 790  790  
 791  791          if (prom_debug)
 792  792                  prom_printf("\tFilter: in: a=%" PRIx64 ", s=%" PRIx64 "\n",
 793  793                      *addr, *size);
 794  794  
 795  795          /*
 796  796           * page zero is required for BIOS.. never make it available
                    *
                    * NOTE(review): this subtracts MMU_PAGESIZE from *size without
                    * checking that *size is at least one page -- confirm callers
                    * never pass a sub-page range starting at 0.
 797  797           */
 798  798          if (*addr == 0) {
 799  799                  *addr += MMU_PAGESIZE;
 800  800                  *size -= MMU_PAGESIZE;
 801  801          }
 802  802  
 803  803          /*
 804  804           * First we trim from the front of the range. Since kbm_probe()
 805  805           * walks ranges in virtual order, but addr/size are physical, we need
 806  806           * to walk the list until no changes are seen.  This deals with the
 807  807           * case where page "p" is mapped at v, page "p + PAGESIZE" is mapped
 808  808           * at w but w < v.
                    *
                    * NOTE(review): when a mapping starts below *addr (pfn_addr <
                    * *addr), the inner loop still advances *addr by the mapping's
                    * full length rather than by the overlap -- confirm intended.
 809  809           */
 810  810          do {
 811  811                  change = 0;
 812  812                  for (va = KERNEL_TEXT;
 813  813                      *size > 0 && kbm_probe(&va, &len, &pfn, &prot) != 0;
 814  814                      va = next_va) {
 815  815  
 816  816                          next_va = va + len;
 817  817                          pfn_addr = pfn_to_pa(pfn);
 818  818                          pfn_eaddr = pfn_addr + len;
 819  819  
                                   /* Mapping covers the current front of the range. */
 820  820                          if (pfn_addr <= *addr && pfn_eaddr > *addr) {
 821  821                                  change = 1;
 822  822                                  while (*size > 0 && len > 0) {
 823  823                                          *addr += MMU_PAGESIZE;
 824  824                                          *size -= MMU_PAGESIZE;
 825  825                                          len -= MMU_PAGESIZE;
 826  826                                  }
 827  827                          }
 828  828                  }
 829  829                  if (change && prom_debug)
 830  830                          prom_printf("\t\ttrim: a=%" PRIx64 ", s=%" PRIx64 "\n",
 831  831                              *addr, *size);
 832  832          } while (change);
 833  833  
 834  834          /*
 835  835           * Trim pages from the end of the range.
                    *
                    * Any mapping whose physical start lies inside what remains of the
                    * range truncates the range so that it ends just before that
                    * mapping.
 836  836           */
 837  837          for (va = KERNEL_TEXT;
 838  838              *size > 0 && kbm_probe(&va, &len, &pfn, &prot) != 0;
 839  839              va = next_va) {
 840  840  
 841  841                  next_va = va + len;
 842  842                  pfn_addr = pfn_to_pa(pfn);
 843  843  
 844  844                  if (pfn_addr >= *addr && pfn_addr < *addr + *size)
 845  845                          *size = pfn_addr - *addr;
 846  846          }
 847  847  
 848  848          if (prom_debug)
 849  849                  prom_printf("\tFilter out: a=%" PRIx64 ", s=%" PRIx64 "\n",
 850  850                      *addr, *size);
 851  851  }
 852  852  
           /*
            * Set the kpm page-size variables as for a 4KB pagesize, then attach and
            * create the segkpm segment covering [kpm_vbase, kpm_vbase + kpm_size) in
            * the kernel address space.  kas.a_lock is held as writer across the
            * attach/create.  Panics if either step fails.
            */
 853  853  static void
 854  854  kpm_init(void)
 855  855  {
 856  856          struct segkpm_crargs b;
 857  857  
 858  858          /*
 859  859           * These variables were all designed for sfmmu in which segkpm is
 860  860           * mapped using a single pagesize - either 8KB or 4MB.  On x86, we
 861  861           * might use 2+ page sizes on a single machine, so none of these
 862  862           * variables have a single correct value.  They are set up as if we
 863  863           * always use a 4KB pagesize, which should do no harm.  In the long
 864  864           * run, we should get rid of KPM's assumption that only a single
 865  865           * pagesize is used.
 866  866           */
 867  867          kpm_pgshft = MMU_PAGESHIFT;
 868  868          kpm_pgsz =  MMU_PAGESIZE;
 869  869          kpm_pgoff = MMU_PAGEOFFSET;
 870  870          kpmp2pshft = 0;
 871  871          kpmpnpgs = 1;
                   /* kpm_vbase must be aligned to the kpm page size chosen above. */
 872  872          ASSERT(((uintptr_t)kpm_vbase & (kpm_pgsz - 1)) == 0);
 873  873  
 874  874          PRM_POINT("about to create segkpm");
 875  875          rw_enter(&kas.a_lock, RW_WRITER);
 876  876  
 877  877          if (seg_attach(&kas, kpm_vbase, kpm_size, segkpm) < 0)
 878  878                  panic("cannot attach segkpm");
 879  879  
 880  880          b.prot = PROT_READ | PROT_WRITE;
 881  881          b.nvcolors = 1;
 882  882  
 883  883          if (segkpm_create(segkpm, (caddr_t)&b) != 0)
 884  884                  panic("segkpm_create segkpm");
 885  885  
 886  886          rw_exit(&kas.a_lock);
 887  887  }
 888  888  
 889  889  /*
 890  890   * The debug info page provides enough information to allow external
 891  891   * inspectors (e.g. when running under a hypervisor) to bootstrap
 892  892   * themselves into allowing full-blown kernel debugging.
            *
            * One page is allocated from the boot loader at the fixed address
            * DEBUG_INFO_VA, zeroed, and filled in as a debug_info_t: magic and
            * version, the address of "modules", the kernel text/data bounds, and
            * the offsets of hat_t.hat_htable and htable_t.ht_pfn.
            *
            * Panics if BOP_ALLOC() does not return DEBUG_INFO_VA.
 893  893   */
 894  894  static void
 895  895  init_debug_info(void)
 896  896  {
 897  897          caddr_t mem;
 898  898          debug_info_t *di;
 899  899  
 900  900  #ifndef __lint
                   /* The structure must fit in the single page allocated below. */
 901  901          ASSERT(sizeof (debug_info_t) < MMU_PAGESIZE);
 902  902  #endif
 903  903  
 904  904          mem = BOP_ALLOC(bootops, (caddr_t)DEBUG_INFO_VA, MMU_PAGESIZE,
 905  905              MMU_PAGESIZE);
 906  906  
 907  907          if (mem != (caddr_t)DEBUG_INFO_VA)
 908  908                  panic("BOP_ALLOC() failed");
 909  909          bzero(mem, MMU_PAGESIZE);
 910  910  
 911  911          di = (debug_info_t *)mem;
 912  912  
 913  913          di->di_magic = DEBUG_INFO_MAGIC;
 914  914          di->di_version = DEBUG_INFO_VERSION;
 915  915          di->di_modules = (uintptr_t)&modules;
 916  916          di->di_s_text = (uintptr_t)s_text;
 917  917          di->di_e_text = (uintptr_t)e_text;
 918  918          di->di_s_data = (uintptr_t)s_data;
 919  919          di->di_e_data = (uintptr_t)e_data;
 920  920          di->di_hat_htable_off = offsetof(hat_t, hat_htable);
 921  921          di->di_ht_pfn_off = offsetof(htable_t, ht_pfn);
 922  922  }
 923  923  
 924  924  /*
 925  925   * Build the memlists and other kernel essential memory system data structures.
 926  926   * This is everything at valloc_base.
            *
            * In order, this function:
            *  - sets the module text/data bounds from the leftover nucleus space,
            *  - reads the boot loader's physinstalled and rsvdmem lists,
            *  - calls mmu_init() and startup_build_mem_nodes(),
            *  - computes npages/physmem,
            *  - registers every early allocation with ADD_TO_ALLOCATIONS() and
            *    picks valloc_base,
            *  - calls perform_allocations(),
            *  - builds phys_install, phys_avail and bios_rsvd,
            *  - initializes page coloring, page counters, pcf and the page_t's
            *    (kphysm_init()),
            *  - calls init_debug_info() and boot_mapin()s the early regions.
 927  927   */
 928  928  static void
 929  929  startup_memlist(void)
 930  930  {
 931  931          size_t memlist_sz;
 932  932          size_t memseg_sz;
 933  933          size_t pagehash_sz;
 934  934          size_t pp_sz;
 935  935          uintptr_t va;
 936  936          size_t len;
 937  937          uint_t prot;
 938  938          pfn_t pfn;
 939  939          int memblocks;
 940  940          pfn_t rsvd_high_pfn;
 941  941          pgcnt_t rsvd_pgcnt;
 942  942          size_t rsvdmemlist_sz;
 943  943          int rsvdmemblocks;
 944  944          caddr_t pagecolor_mem;
 945  945          size_t pagecolor_memsz;
 946  946          caddr_t page_ctrs_mem;
 947  947          size_t page_ctrs_size;
 948  948          size_t pse_table_alloc_size;
 949  949          struct memlist *current;
 950  950          extern void startup_build_mem_nodes(struct memlist *);
 951  951  
 952  952          /* XX64 fix these - they should be in include files */
 953  953          extern size_t page_coloring_init(uint_t, int, int);
 954  954          extern void page_coloring_setup(caddr_t);
 955  955  
 956  956          PRM_POINT("startup_memlist() starting...");
 957  957  
 958  958          /*
 959  959           * Use leftover large page nucleus text/data space for loadable modules.
 960  960           * Use at most MODTEXT/MODDATA.
 961  961           */
 962  962          len = kbm_nucleus_size;
 963  963          ASSERT(len > MMU_PAGESIZE);
 964  964  
 965  965          moddata = (caddr_t)ROUND_UP_PAGE(e_data);
 966  966          e_moddata = (caddr_t)P2ROUNDUP((uintptr_t)e_data, (uintptr_t)len);
 967  967          if (e_moddata - moddata > MODDATA)
 968  968                  e_moddata = moddata + MODDATA;
 969  969  
 970  970          modtext = (caddr_t)ROUND_UP_PAGE(e_text);
 971  971          e_modtext = (caddr_t)P2ROUNDUP((uintptr_t)e_text, (uintptr_t)len);
 972  972          if (e_modtext - modtext > MODTEXT)
 973  973                  e_modtext = modtext + MODTEXT;
 974  974  
 975  975          econtig = e_moddata;
 976  976  
 977  977          PRM_DEBUG(modtext);
 978  978          PRM_DEBUG(e_modtext);
 979  979          PRM_DEBUG(moddata);
 980  980          PRM_DEBUG(e_moddata);
 981  981          PRM_DEBUG(econtig);
 982  982  
 983  983          /*
 984  984           * Examine the boot loader physical memory map to find out:
 985  985           * - total memory in system - physinstalled
 986  986           * - the max physical address - physmax
 987  987           * - the number of discontiguous segments of memory.
 988  988           */
 989  989          if (prom_debug)
 990  990                  print_memlist("boot physinstalled",
 991  991                      bootops->boot_mem->physinstalled);
 992  992          installed_top_size_ex(bootops->boot_mem->physinstalled, &physmax,
 993  993              &physinstalled, &memblocks);
 994  994          PRM_DEBUG(physmax);
 995  995          PRM_DEBUG(physinstalled);
 996  996          PRM_DEBUG(memblocks);
 997  997  
 998  998          /*
 999  999           * Compute maximum physical address for memory DR operations.
1000 1000           * Memory DR operations are unsupported on xpv or 32bit OSes.
                    *
                    * Note the dangling "else" below: on non-amd64 builds only the
                    * "plat_dr_physmax = 0" statement is compiled.
1001 1001           */
1002 1002  #ifdef  __amd64
1003 1003          if (plat_dr_support_memory()) {
1004 1004                  if (plat_dr_physmax == 0) {
1005 1005                          uint_t pabits = UINT_MAX;
1006 1006  
1007 1007                          cpuid_get_addrsize(CPU, &pabits, NULL);
1008 1008                          plat_dr_physmax = btop(1ULL << pabits);
1009 1009                  }
1010 1010                  if (plat_dr_physmax > PHYSMEM_MAX64)
1011 1011                          plat_dr_physmax = PHYSMEM_MAX64;
1012 1012          } else
1013 1013  #endif
1014 1014                  plat_dr_physmax = 0;
1015 1015  
1016 1016          /*
1017 1017           * Examine the bios reserved memory to find out:
1018 1018           * - the number of discontiguous segments of memory.
1019 1019           */
1020 1020          if (prom_debug)
1021 1021                  print_memlist("boot reserved mem",
1022 1022                      bootops->boot_mem->rsvdmem);
1023 1023          installed_top_size_ex(bootops->boot_mem->rsvdmem, &rsvd_high_pfn,
1024 1024              &rsvd_pgcnt, &rsvdmemblocks);
1025 1025          PRM_DEBUG(rsvd_high_pfn);
1026 1026          PRM_DEBUG(rsvd_pgcnt);
1027 1027          PRM_DEBUG(rsvdmemblocks);
1028 1028  
1029 1029          /*
1030 1030           * Initialize hat's mmu parameters.
1031 1031           * Check for enforce-prot-exec in boot environment. It's used to
1032 1032           * enable/disable support for the page table entry NX bit.
1033 1033           * The default is to enforce PROT_EXEC on processors that support NX.
1034 1034           * Boot seems to round up the "len", but 8 seems to be big enough.
                    *
                    * (The enforce-prot-exec property itself is read further down,
                    * after startup_build_mem_nodes().)
1035 1035           */
1036 1036          mmu_init();
1037 1037  
1038 1038  #ifdef  __i386
1039 1039          /*
1040 1040           * physmax is lowered if there is more memory than can be
1041 1041           * physically addressed in 32 bit (PAE/non-PAE) modes.
1042 1042           */
1043 1043          if (mmu.pae_hat) {
1044 1044                  if (PFN_ABOVE64G(physmax)) {
1045 1045                          physinstalled -= (physmax - (PFN_64G - 1));
1046 1046                          physmax = PFN_64G - 1;
1047 1047                  }
1048 1048          } else {
1049 1049                  if (PFN_ABOVE4G(physmax)) {
1050 1050                          physinstalled -= (physmax - (PFN_4G - 1));
1051 1051                          physmax = PFN_4G - 1;
1052 1052                  }
1053 1053          }
1054 1054  #endif
1055 1055  
1056 1056          startup_build_mem_nodes(bootops->boot_mem->physinstalled);
1057 1057  
                   /*
                    * Only the exact value "off" clears mmu.pt_nx; a property of 8 or
                    * more bytes is treated as an empty string.
                    *
                    * NOTE(review): the inner "int len" shadows the function-scope
                    * "size_t len".  Also, the BOP_GETPROP() result is discarded, so
                    * if it can fail, "value" is compared while uninitialized --
                    * confirm it cannot fail once BOP_GETPROPLEN() has succeeded.
                    */
1058 1058          if (BOP_GETPROPLEN(bootops, "enforce-prot-exec") >= 0) {
1059 1059                  int len = BOP_GETPROPLEN(bootops, "enforce-prot-exec");
1060 1060                  char value[8];
1061 1061  
1062 1062                  if (len < 8)
1063 1063                          (void) BOP_GETPROP(bootops, "enforce-prot-exec", value);
1064 1064                  else
1065 1065                          (void) strcpy(value, "");
1066 1066                  if (strcmp(value, "off") == 0)
1067 1067                          mmu.pt_nx = 0;
1068 1068          }
1069 1069          PRM_DEBUG(mmu.pt_nx);
1070 1070  
1071 1071          /*
1072 1072           * We will need page_t's for every page in the system, except for
1073 1073           * memory mapped at or above the start of the kernel text segment.
1074 1074           *
1075 1075           * Mappings at or above e_moddata are attributed to the kernel
                    * debugger (obp_pages).
1076 1076           */
1077 1077          npages = physinstalled - 1; /* avail_filter() skips page 0, so "- 1" */
1078 1078          obp_pages = 0;
1079 1079          va = KERNEL_TEXT;
1080 1080          while (kbm_probe(&va, &len, &pfn, &prot) != 0) {
1081 1081                  npages -= len >> MMU_PAGESHIFT;
1082 1082                  if (va >= (uintptr_t)e_moddata)
1083 1083                          obp_pages += len >> MMU_PAGESHIFT;
1084 1084                  va += len;
1085 1085          }
1086 1086          PRM_DEBUG(npages);
1087 1087          PRM_DEBUG(obp_pages);
1088 1088  
1089 1089          /*
1090 1090           * If physmem is patched to be non-zero, use it instead of the computed
1091 1091           * value unless it is larger than the actual amount of memory on hand.
                    * When physmem lowers npages, the computed count is kept in
                    * orig_npages.
1092 1092           */
1093 1093          if (physmem == 0 || physmem > npages) {
1094 1094                  physmem = npages;
1095 1095          } else if (physmem < npages) {
1096 1096                  orig_npages = npages;
1097 1097                  npages = physmem;
1098 1098          }
1099 1099          PRM_DEBUG(physmem);
1100 1100  
1101 1101          /*
1102 1102           * We now compute the sizes of all the initial allocations for
1103 1103           * structures the kernel needs in order to do kmem_alloc(). These
1104 1104           * include:
1105 1105           *      memsegs
1106 1106           *      memlists
1107 1107           *      page hash table
1108 1108           *      page_t's
1109 1109           *      page coloring data structs
1110 1110           */
1111 1111          memseg_sz = sizeof (struct memseg) * (memblocks + POSS_NEW_FRAGMENTS);
1112 1112          ADD_TO_ALLOCATIONS(memseg_base, memseg_sz);
1113 1113          PRM_DEBUG(memseg_sz);
1114 1114  
1115 1115          /*
1116 1116           * Reserve space for memlists. There's no real good way to know exactly
1117 1117           * how much room we'll need, but this should be a good upper bound.
1118 1118           */
1119 1119          memlist_sz = ROUND_UP_PAGE(2 * sizeof (struct memlist) *
1120 1120              (memblocks + POSS_NEW_FRAGMENTS));
1121 1121          ADD_TO_ALLOCATIONS(memlist, memlist_sz);
1122 1122          PRM_DEBUG(memlist_sz);
1123 1123  
1124 1124          /*
1125 1125           * Reserve space for bios reserved memlists.
1126 1126           */
1127 1127          rsvdmemlist_sz = ROUND_UP_PAGE(2 * sizeof (struct memlist) *
1128 1128              (rsvdmemblocks + POSS_NEW_FRAGMENTS));
1129 1129          ADD_TO_ALLOCATIONS(bios_rsvd, rsvdmemlist_sz);
1130 1130          PRM_DEBUG(rsvdmemlist_sz);
1131 1131  
1132 1132          /* LINTED */
1133 1133          ASSERT(P2SAMEHIGHBIT((1 << PP_SHIFT), sizeof (struct page)));
1134 1134          /*
1135 1135           * The page structure hash table size is a power of 2
1136 1136           * such that the average hash chain length is PAGE_HASHAVELEN.
1137 1137           */
1138 1138          page_hashsz = npages / PAGE_HASHAVELEN;
1139 1139          page_hashsz_shift = highbit(page_hashsz);
1140 1140          page_hashsz = 1 << page_hashsz_shift;
1141 1141          pagehash_sz = sizeof (struct page *) * page_hashsz;
1142 1142          ADD_TO_ALLOCATIONS(page_hash, pagehash_sz);
1143 1143          PRM_DEBUG(pagehash_sz);
1144 1144  
1145 1145          /*
1146 1146           * Set aside room for the page structures themselves.
1147 1147           */
1148 1148          PRM_DEBUG(npages);
1149 1149          pp_sz = sizeof (struct page) * npages;
1150 1150          ADD_TO_ALLOCATIONS(pp_base, pp_sz);
1151 1151          PRM_DEBUG(pp_sz);
1152 1152  
1153 1153          /*
1154 1154           * determine l2 cache info and memory size for page coloring
1155 1155           */
1156 1156          (void) getl2cacheinfo(CPU,
1157 1157              &l2cache_sz, &l2cache_linesz, &l2cache_assoc);
1158 1158          pagecolor_memsz =
1159 1159              page_coloring_init(l2cache_sz, l2cache_linesz, l2cache_assoc);
1160 1160          ADD_TO_ALLOCATIONS(pagecolor_mem, pagecolor_memsz);
1161 1161          PRM_DEBUG(pagecolor_memsz);
1162 1162  
1163 1163          page_ctrs_size = page_ctrs_sz();
1164 1164          ADD_TO_ALLOCATIONS(page_ctrs_mem, page_ctrs_size);
1165 1165          PRM_DEBUG(page_ctrs_size);
1166 1166  
1167 1167          /*
1168 1168           * Allocate the array that protects pp->p_selock.
1169 1169           */
1170 1170          pse_shift = size_pse_array(physmem, max_ncpus);
1171 1171          pse_table_size = 1 << pse_shift;
1172 1172          pse_table_alloc_size = pse_table_size * sizeof (pad_mutex_t);
1173 1173          ADD_TO_ALLOCATIONS(pse_mutex, pse_table_alloc_size);
1174 1174  
1175 1175  #if defined(__amd64)
1176 1176          valloc_sz = ROUND_UP_LPAGE(valloc_sz);
1177 1177          valloc_base = VALLOC_BASE;
1178 1178  
1179 1179          /*
1180 1180           * The default values of VALLOC_BASE and SEGKPM_BASE should work
1181 1181           * for values of physmax up to 1 Terabyte. They need adjusting when
1182 1182           * memory is at addresses above 1 TB. When adjusted, segkpm_base must
1183 1183           * be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte).
1184 1184           */
1185 1185          if (physmax + 1 > mmu_btop(TERABYTE) ||
1186 1186              plat_dr_physmax > mmu_btop(TERABYTE)) {
1187 1187                  uint64_t kpm_resv_amount = mmu_ptob(physmax + 1);
1188 1188  
1189 1189                  if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) {
1190 1190                          kpm_resv_amount = mmu_ptob(plat_dr_physmax);
1191 1191                  }
1192 1192  
1193 1193                  segkpm_base = -(P2ROUNDUP((2 * kpm_resv_amount),
1194 1194                      KERNEL_REDZONE_SIZE));      /* down from top VA */
1195 1195  
1196 1196                  /* make sure we leave some space for user apps above hole */
1197 1197                  segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE);
1198 1198                  if (segkpm_base > SEGKPM_BASE)
1199 1199                          segkpm_base = SEGKPM_BASE;
1200 1200                  PRM_DEBUG(segkpm_base);
1201 1201  
1202 1202                  valloc_base = segkpm_base + P2ROUNDUP(kpm_resv_amount, ONE_GIG);
                           /* A result below segkpm_base means the addition wrapped. */
1203 1203                  if (valloc_base < segkpm_base)
1204 1204                          panic("not enough kernel VA to support memory size");
1205 1205                  PRM_DEBUG(valloc_base);
1206 1206          }
1207 1207  #else   /* __i386 */
1208 1208          valloc_base = (uintptr_t)(MISC_VA_BASE - valloc_sz);
1209 1209          valloc_base = P2ALIGN(valloc_base, mmu.level_size[1]);
1210 1210          PRM_DEBUG(valloc_base);
1211 1211  #endif  /* __i386 */
1212 1212  
1213 1213          /*
1214 1214           * do all the initial allocations
1215 1215           */
1216 1216          perform_allocations();
1217 1217  
1218 1218          /*
1219 1219           * Build phys_install and phys_avail in kernel memspace.
1220 1220           * - phys_install should be all memory in the system.
1221 1221           * - phys_avail is phys_install minus any memory mapped before this
1222 1222           *    point above KERNEL_TEXT.
                    *
                    * Both lists are built back to back inside the single "memlist"
                    * allocation; "current" tracks the next unused entry.
1223 1223           */
1224 1224          current = phys_install = memlist;
1225 1225          copy_memlist_filter(bootops->boot_mem->physinstalled, &current, NULL);
1226 1226          if ((caddr_t)current > (caddr_t)memlist + memlist_sz)
1227 1227                  panic("physinstalled was too big!");
1228 1228          if (prom_debug)
1229 1229                  print_memlist("phys_install", phys_install);
1230 1230  
1231 1231          phys_avail = current;
1232 1232          PRM_POINT("Building phys_avail:\n");
1233 1233          copy_memlist_filter(bootops->boot_mem->physinstalled, &current,
1234 1234              avail_filter);
1235 1235          if ((caddr_t)current > (caddr_t)memlist + memlist_sz)
1236 1236                  panic("physavail was too big!");
1237 1237          if (prom_debug)
1238 1238                  print_memlist("phys_avail", phys_avail);
1239 1239  #ifndef __xpv
1240 1240          /*
1241 1241           * Free unused memlist items, which may be used by memory DR driver
1242 1242           * at runtime.
1243 1243           */
1244 1244          if ((caddr_t)current < (caddr_t)memlist + memlist_sz) {
1245 1245                  memlist_free_block((caddr_t)current,
1246 1246                      (caddr_t)memlist + memlist_sz - (caddr_t)current);
1247 1247          }
1248 1248  #endif
1249 1249  
1250 1250          /*
1251 1251           * Build bios reserved memspace
1252 1252           */
1253 1253          current = bios_rsvd;
1254 1254          copy_memlist_filter(bootops->boot_mem->rsvdmem, &current, NULL);
1255 1255          if ((caddr_t)current > (caddr_t)bios_rsvd + rsvdmemlist_sz)
1256 1256                  panic("bios_rsvd was too big!");
1257 1257          if (prom_debug)
1258 1258                  print_memlist("bios_rsvd", bios_rsvd);
1259 1259  #ifndef __xpv
1260 1260          /*
1261 1261           * Free unused memlist items, which may be used by memory DR driver
1262 1262           * at runtime.
1263 1263           */
1264 1264          if ((caddr_t)current < (caddr_t)bios_rsvd + rsvdmemlist_sz) {
1265 1265                  memlist_free_block((caddr_t)current,
1266 1266                      (caddr_t)bios_rsvd + rsvdmemlist_sz - (caddr_t)current);
1267 1267          }
1268 1268  #endif
1269 1269  
1270 1270          /*
1271 1271           * setup page coloring
1272 1272           */
1273 1273          page_coloring_setup(pagecolor_mem);
1274 1274          page_lock_init();       /* currently a no-op */
1275 1275  
1276 1276          /*
1277 1277           * free page list counters
1278 1278           */
1279 1279          (void) page_ctrs_alloc(page_ctrs_mem);
1280 1280  
1281 1281          /*
1282 1282           * Size the pcf array based on the number of cpus in the box at
1283 1283           * boot time.
1284 1284           */
1285 1285  
1286 1286          pcf_init();
1287 1287  
1288 1288          /*
1289 1289           * Initialize the page structures from the memory lists.
                    * npages is replaced by the value kphysm_init() returns.
1290 1290           */
1291 1291          availrmem_initial = availrmem = freemem = 0;
1292 1292          PRM_POINT("Calling kphysm_init()...");
1293 1293          npages = kphysm_init(pp_base, npages);
1294 1294          PRM_POINT("kphysm_init() done");
1295 1295          PRM_DEBUG(npages);
1296 1296  
1297 1297          init_debug_info();
1298 1298  
1299 1299          /*
1300 1300           * Now that page_t's have been initialized, remove all the
1301 1301           * initial allocation pages from the kernel free page lists.
1302 1302           */
1303 1303          boot_mapin((caddr_t)valloc_base, valloc_sz);
1304 1304          boot_mapin((caddr_t)MISC_VA_BASE, MISC_VA_SIZE);
1305 1305          PRM_POINT("startup_memlist() done");
1306 1306  
1307 1307          PRM_DEBUG(valloc_sz);
1308 1308  
1309 1309  #if defined(__amd64)
                   /*
                    * Raise textrepl_size_thresh to just under 16MB when availrmem is
                    * at least textrepl_min_gb gigabytes and the l2 cache is no larger
                    * than 2MB.
                    */
1310 1310          if ((availrmem >> (30 - MMU_PAGESHIFT)) >=
1311 1311              textrepl_min_gb && l2cache_sz <= 2 << 20) {
1312 1312                  extern size_t textrepl_size_thresh;
1313 1313                  textrepl_size_thresh = (16 << 20) - 1;
1314 1314          }
1315 1315  #endif
1316 1316  }
1317 1317  
1318 1318  /*
1319 1319   * Layout the kernel's part of address space and initialize kmem allocator.
1320 1320   */
1321 1321  static void
1322 1322  startup_kmem(void)
1323 1323  {
1324 1324          extern void page_set_colorequiv_arr(void);
1325 1325  
1326 1326          PRM_POINT("startup_kmem() starting...");
1327 1327  
1328 1328  #if defined(__amd64)
1329 1329          if (eprom_kernelbase && eprom_kernelbase != KERNELBASE)
1330 1330                  cmn_err(CE_NOTE, "!kernelbase cannot be changed on 64-bit "
1331 1331                      "systems.");
1332 1332          kernelbase = segkpm_base - KERNEL_REDZONE_SIZE;
1333 1333          core_base = (uintptr_t)COREHEAP_BASE;
1334 1334          core_size = (size_t)MISC_VA_BASE - COREHEAP_BASE;
1335 1335  #else   /* __i386 */
1336 1336          /*
1337 1337           * We configure kernelbase based on:
1338 1338           *
1339 1339           * 1. user specified kernelbase via eeprom command. Value cannot exceed
1340 1340           *    KERNELBASE_MAX. we large page align eprom_kernelbase
1341 1341           *
1342 1342           * 2. Default to KERNELBASE and adjust to 2X less the size for page_t.
1343 1343           *    On large memory systems we must lower kernelbase to allow
1344 1344           *    enough room for page_t's for all of memory.
1345 1345           *
1346 1346           * The value set here, might be changed a little later.
1347 1347           */
1348 1348          if (eprom_kernelbase) {
1349 1349                  kernelbase = eprom_kernelbase & mmu.level_mask[1];
1350 1350                  if (kernelbase > KERNELBASE_MAX)
1351 1351                          kernelbase = KERNELBASE_MAX;
1352 1352          } else {
1353 1353                  kernelbase = (uintptr_t)KERNELBASE;
1354 1354                  kernelbase -= ROUND_UP_4MEG(2 * valloc_sz);
1355 1355          }
1356 1356          ASSERT((kernelbase & mmu.level_offset[1]) == 0);
1357 1357          core_base = valloc_base;
1358 1358          core_size = 0;
1359 1359  #endif  /* __i386 */
1360 1360  
1361 1361          PRM_DEBUG(core_base);
1362 1362          PRM_DEBUG(core_size);
1363 1363          PRM_DEBUG(kernelbase);
1364 1364  
1365 1365  #if defined(__i386)
1366 1366          segkp_fromheap = 1;
1367 1367  #endif  /* __i386 */
1368 1368  
1369 1369          ekernelheap = (char *)core_base;
1370 1370          PRM_DEBUG(ekernelheap);
1371 1371  
1372 1372          /*
1373 1373           * Now that we know the real value of kernelbase,
1374 1374           * update variables that were initialized with a value of
1375 1375           * KERNELBASE (in common/conf/param.c).
1376 1376           *
1377 1377           * XXX  The problem with this sort of hackery is that the
1378 1378           *      compiler just may feel like putting the const declarations
1379 1379           *      (in param.c) into the .text section.  Perhaps they should
1380 1380           *      just be declared as variables there?
1381 1381           */
1382 1382  
1383 1383          *(uintptr_t *)&_kernelbase = kernelbase;
1384 1384          *(uintptr_t *)&_userlimit = kernelbase;
1385 1385  #if defined(__amd64)
1386 1386          *(uintptr_t *)&_userlimit -= KERNELBASE - USERLIMIT;
1387 1387  #else
1388 1388          *(uintptr_t *)&_userlimit32 = _userlimit;
1389 1389  #endif
1390 1390          PRM_DEBUG(_kernelbase);
1391 1391          PRM_DEBUG(_userlimit);
1392 1392          PRM_DEBUG(_userlimit32);
1393 1393  
1394 1394          layout_kernel_va();
1395 1395  
1396 1396  #if defined(__i386)
1397 1397          /*
1398 1398           * If segmap is too large we can push the bottom of the kernel heap
1399 1399           * higher than the base.  Or worse, it could exceed the top of the
1400 1400           * VA space entirely, causing it to wrap around.
1401 1401           */
1402 1402          if (kernelheap >= ekernelheap || (uintptr_t)kernelheap < kernelbase)
1403 1403                  panic("too little address space available for kernelheap,"
1404 1404                      " use eeprom for lower kernelbase or smaller segmapsize");
1405 1405  #endif  /* __i386 */
1406 1406  
1407 1407          /*
1408 1408           * Initialize the kernel heap. Note 3rd argument must be > 1st.
1409 1409           */
1410 1410          kernelheap_init(kernelheap, ekernelheap,
1411 1411              kernelheap + MMU_PAGESIZE,
1412 1412              (void *)core_base, (void *)(core_base + core_size));
1413 1413  
1414 1414  #if defined(__xpv)
1415 1415          /*
1416 1416           * Link pending events struct into cpu struct
1417 1417           */
1418 1418          CPU->cpu_m.mcpu_evt_pend = &cpu0_evt_data;
1419 1419  #endif
1420 1420          /*
1421 1421           * Initialize kernel memory allocator.
1422 1422           */
1423 1423          kmem_init();
1424 1424  
1425 1425          /*
1426 1426           * Factor in colorequiv to check additional 'equivalent' bins
1427 1427           */
1428 1428          page_set_colorequiv_arr();
1429 1429  
1430 1430          /*
1431 1431           * print this out early so that we know what's going on
1432 1432           */
1433 1433          print_x86_featureset(x86_featureset);
1434 1434  
1435 1435          /*
1436 1436           * Initialize bp_mapin().
1437 1437           */
1438 1438          bp_init(MMU_PAGESIZE, HAT_STORECACHING_OK);
1439 1439  
1440 1440          /*
1441 1441           * orig_npages is non-zero if physmem has been configured for less
1442 1442           * than the available memory.
1443 1443           */
1444 1444          if (orig_npages) {
1445 1445                  cmn_err(CE_WARN, "!%slimiting physmem to 0x%lx of 0x%lx pages",
1446 1446                      (npages == PHYSMEM ? "Due to virtual address space " : ""),
1447 1447                      npages, orig_npages);
1448 1448          }
1449 1449  #if defined(__i386)
1450 1450          if (eprom_kernelbase && (eprom_kernelbase != kernelbase))
1451 1451                  cmn_err(CE_WARN, "kernelbase value, User specified 0x%lx, "
1452 1452                      "System using 0x%lx",
1453 1453                      (uintptr_t)eprom_kernelbase, (uintptr_t)kernelbase);
1454 1454  #endif
1455 1455  
1456 1456  #ifdef  KERNELBASE_ABI_MIN
1457 1457          if (kernelbase < (uintptr_t)KERNELBASE_ABI_MIN) {
1458 1458                  cmn_err(CE_NOTE, "!kernelbase set to 0x%lx, system is not "
1459 1459                      "i386 ABI compliant.", (uintptr_t)kernelbase);
1460 1460          }
1461 1461  #endif
1462 1462  
1463 1463  #ifndef __xpv
1464 1464          if (plat_dr_support_memory()) {
1465 1465                  mem_config_init();
1466 1466          }
1467 1467  #else   /* __xpv */
1468 1468          /*
1469 1469           * Some of the xen start information has to be relocated up
1470 1470           * into the kernel's permanent address space.
1471 1471           */
1472 1472          PRM_POINT("calling xen_relocate_start_info()");
1473 1473          xen_relocate_start_info();
1474 1474          PRM_POINT("xen_relocate_start_info() done");
1475 1475  
1476 1476          /*
1477 1477           * (Update the vcpu pointer in our cpu structure to point into
1478 1478           * the relocated shared info.)
1479 1479           */
1480 1480          CPU->cpu_m.mcpu_vcpu_info =
1481 1481              &HYPERVISOR_shared_info->vcpu_info[CPU->cpu_id];
1482 1482  #endif  /* __xpv */
1483 1483  
1484 1484          PRM_POINT("startup_kmem() done");
1485 1485  }
1486 1486  
1487 1487  #ifndef __xpv
1488 1488  /*
1489 1489   * If we have detected that we are running in an HVM environment, we need
1490 1490   * to prepend the PV driver directory to the module search path.
1491 1491   */
1492 1492  #define HVM_MOD_DIR "/platform/i86hvm/kernel"
1493 1493  static void
1494 1494  update_default_path()
1495 1495  {
1496 1496          char *current, *newpath;
1497 1497          int newlen;
1498 1498  
1499 1499          /*
1500 1500           * We are about to resync with krtld.  krtld will reset its
1501 1501           * internal module search path iff Solaris has set default_path.
1502 1502           * We want to be sure we're prepending this new directory to the
1503 1503           * right search path.
1504 1504           */
1505 1505          current = (default_path == NULL) ? kobj_module_path : default_path;
1506 1506  
1507 1507          newlen = strlen(HVM_MOD_DIR) + strlen(current) + 2;
1508 1508          newpath = kmem_alloc(newlen, KM_SLEEP);
1509 1509          (void) strcpy(newpath, HVM_MOD_DIR);
1510 1510          (void) strcat(newpath, " ");
1511 1511          (void) strcat(newpath, current);
1512 1512  
1513 1513          default_path = newpath;
1514 1514  }
1515 1515  #endif
1516 1516  
1517 1517  static void
1518 1518  startup_modules(void)
1519 1519  {
1520 1520          int cnt;
1521 1521          extern void prom_setup(void);
1522 1522          int32_t v, h;
1523 1523          char d[11];
1524 1524          char *cp;
1525 1525          cmi_hdl_t hdl;
1526 1526  
1527 1527          PRM_POINT("startup_modules() starting...");
1528 1528  
1529 1529  #ifndef __xpv
1530 1530          /*
1531 1531           * Initialize ten-micro second timer so that drivers will
1532 1532           * not get short changed in their init phase. This was
1533 1533           * not getting called until clkinit which, on fast cpu's
1534 1534           * caused the drv_usecwait to be way too short.
1535 1535           */
1536 1536          microfind();
1537 1537  
1538 1538          if ((get_hwenv() & HW_XEN_HVM) != 0)
1539 1539                  update_default_path();
1540 1540  #endif
1541 1541  
1542 1542          /*
1543 1543           * Read the GMT lag from /etc/rtc_config.
1544 1544           */
1545 1545          sgmtl(process_rtc_config_file());
1546 1546  
1547 1547          /*
1548 1548           * Calculate default settings of system parameters based upon
1549 1549           * maxusers, yet allow to be overridden via the /etc/system file.
1550 1550           */
1551 1551          param_calc(0);
1552 1552  
1553 1553          mod_setup();
1554 1554  
1555 1555          /*
1556 1556           * Initialize system parameters.
1557 1557           */
1558 1558          param_init();
1559 1559  
1560 1560          /*
1561 1561           * Initialize the default brands
1562 1562           */
1563 1563          brand_init();
1564 1564  
1565 1565          /*
1566 1566           * maxmem is the amount of physical memory we're playing with.
1567 1567           */
1568 1568          maxmem = physmem;
1569 1569  
1570 1570          /*
1571 1571           * Initialize segment management stuff.
1572 1572           */
1573 1573          seg_init();
1574 1574  
1575 1575          if (modload("fs", "specfs") == -1)
1576 1576                  halt("Can't load specfs");
1577 1577  
1578 1578          if (modload("fs", "devfs") == -1)
1579 1579                  halt("Can't load devfs");
1580 1580  
1581 1581          if (modload("fs", "dev") == -1)
1582 1582                  halt("Can't load dev");
1583 1583  
1584 1584          if (modload("fs", "procfs") == -1)
1585 1585                  halt("Can't load procfs");
1586 1586  
1587 1587          (void) modloadonly("sys", "lbl_edition");
1588 1588  
1589 1589          dispinit();
1590 1590  
1591 1591          /* Read cluster configuration data. */
1592 1592          clconf_init();
1593 1593  
1594 1594  #if defined(__xpv)
1595 1595          (void) ec_init();
1596 1596          gnttab_init();
1597 1597          (void) xs_early_init();
1598 1598  #endif /* __xpv */
1599 1599  
1600 1600          /*
1601 1601           * Create a kernel device tree. First, create rootnex and
1602 1602           * then invoke bus specific code to probe devices.
1603 1603           */
1604 1604          setup_ddi();
1605 1605  
1606 1606  #ifdef __xpv
1607 1607          if (DOMAIN_IS_INITDOMAIN(xen_info))
1608 1608  #endif
1609 1609          {
1610 1610                  /*
1611 1611                   * Load the System Management BIOS into the global ksmbios
1612 1612                   * handle, if an SMBIOS is present on this system.
1613 1613                   */
1614 1614                  ksmbios = smbios_open(NULL, SMB_VERSION, ksmbios_flags, NULL);
1615 1615          }
1616 1616  
1617 1617  
1618 1618          /*
1619 1619           * Originally clconf_init() apparently needed the hostid.  But
1620 1620           * this no longer appears to be true - it uses its own nodeid.
1621 1621           * By placing the hostid logic here, we are able to make use of
1622 1622           * the SMBIOS UUID.
1623 1623           */
1624 1624          if ((h = set_soft_hostid()) == HW_INVALID_HOSTID) {
1625 1625                  cmn_err(CE_WARN, "Unable to set hostid");
1626 1626          } else {
1627 1627                  for (v = h, cnt = 0; cnt < 10; cnt++) {
1628 1628                          d[cnt] = (char)(v % 10);
1629 1629                          v /= 10;
1630 1630                          if (v == 0)
1631 1631                                  break;
1632 1632                  }
1633 1633                  for (cp = hw_serial; cnt >= 0; cnt--)
1634 1634                          *cp++ = d[cnt] + '0';
1635 1635                  *cp = 0;
1636 1636          }
1637 1637  
1638 1638          /*
1639 1639           * Set up the CPU module subsystem for the boot cpu in the native
1640 1640           * case, and all physical cpu resource in the xpv dom0 case.
1641 1641           * Modifies the device tree, so this must be done after
1642 1642           * setup_ddi().
1643 1643           */
1644 1644  #ifdef __xpv
1645 1645          /*
1646 1646           * If paravirtualized and on dom0 then we initialize all physical
1647 1647           * cpu handles now;  if paravirtualized on a domU then do not
1648 1648           * initialize.
1649 1649           */
1650 1650          if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1651 1651                  xen_mc_lcpu_cookie_t cpi;
1652 1652  
1653 1653                  for (cpi = xen_physcpu_next(NULL); cpi != NULL;
1654 1654                      cpi = xen_physcpu_next(cpi)) {
1655 1655                          if ((hdl = cmi_init(CMI_HDL_SOLARIS_xVM_MCA,
1656 1656                              xen_physcpu_chipid(cpi), xen_physcpu_coreid(cpi),
1657 1657                              xen_physcpu_strandid(cpi))) != NULL &&
1658 1658                              is_x86_feature(x86_featureset, X86FSET_MCA))
1659 1659                                  cmi_mca_init(hdl);
1660 1660                  }
1661 1661          }
1662 1662  #else
1663 1663          /*
1664 1664           * Initialize a handle for the boot cpu - others will initialize
1665 1665           * as they startup.  Do not do this if we know we are in an HVM domU.
1666 1666           */
1667 1667          if ((get_hwenv() & HW_XEN_HVM) == 0 &&
1668 1668              (hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(CPU),
1669 1669              cmi_ntv_hwcoreid(CPU), cmi_ntv_hwstrandid(CPU))) != NULL &&
1670 1670              is_x86_feature(x86_featureset, X86FSET_MCA)) {
1671 1671                          cmi_mca_init(hdl);
1672 1672                          CPU->cpu_m.mcpu_cmi_hdl = hdl;
1673 1673          }
1674 1674  #endif  /* __xpv */
1675 1675  
1676 1676          /*
1677 1677           * Fake a prom tree such that /dev/openprom continues to work
1678 1678           */
1679 1679          PRM_POINT("startup_modules: calling prom_setup...");
1680 1680          prom_setup();
1681 1681          PRM_POINT("startup_modules: done");
1682 1682  
1683 1683          /*
1684 1684           * Load all platform specific modules
1685 1685           */
1686 1686          PRM_POINT("startup_modules: calling psm_modload...");
1687 1687          psm_modload();
1688 1688  
1689 1689          PRM_POINT("startup_modules() done");
1690 1690  }
1691 1691  
1692 1692  /*
1693 1693   * claim a "setaside" boot page for use in the kernel
1694 1694   */
1695 1695  page_t *
1696 1696  boot_claim_page(pfn_t pfn)
1697 1697  {
1698 1698          page_t *pp;
1699 1699  
1700 1700          pp = page_numtopp_nolock(pfn);
1701 1701          ASSERT(pp != NULL);
1702 1702  
1703 1703          if (PP_ISBOOTPAGES(pp)) {
1704 1704                  if (pp->p_next != NULL)
1705 1705                          pp->p_next->p_prev = pp->p_prev;
1706 1706                  if (pp->p_prev == NULL)
1707 1707                          bootpages = pp->p_next;
1708 1708                  else
1709 1709                          pp->p_prev->p_next = pp->p_next;
1710 1710          } else {
1711 1711                  /*
1712 1712                   * htable_attach() expects a base pagesize page
1713 1713                   */
1714 1714                  if (pp->p_szc != 0)
1715 1715                          page_boot_demote(pp);
1716 1716                  pp = page_numtopp(pfn, SE_EXCL);
1717 1717          }
1718 1718          return (pp);
1719 1719  }
1720 1720  
1721 1721  /*
1722 1722   * Walk through the pagetables looking for pages mapped in by boot.  If the
1723 1723   * setaside flag is set the pages are expected to be returned to the
1724 1724   * kernel later in boot, so we add them to the bootpages list.
1725 1725   */
1726 1726  static void
1727 1727  protect_boot_range(uintptr_t low, uintptr_t high, int setaside)
1728 1728  {
1729 1729          uintptr_t va = low;
1730 1730          size_t len;
1731 1731          uint_t prot;
1732 1732          pfn_t pfn;
1733 1733          page_t *pp;
1734 1734          pgcnt_t boot_protect_cnt = 0;
1735 1735  
1736 1736          while (kbm_probe(&va, &len, &pfn, &prot) != 0 && va < high) {
1737 1737                  if (va + len >= high)
1738 1738                          panic("0x%lx byte mapping at 0x%p exceeds boot's "
1739 1739                              "legal range.", len, (void *)va);
1740 1740  
1741 1741                  while (len > 0) {
1742 1742                          pp = page_numtopp_alloc(pfn);
1743 1743                          if (pp != NULL) {
1744 1744                                  if (setaside == 0)
1745 1745                                          panic("Unexpected mapping by boot.  "
1746 1746                                              "addr=%p pfn=%lx\n",
1747 1747                                              (void *)va, pfn);
1748 1748  
1749 1749                                  pp->p_next = bootpages;
1750 1750                                  pp->p_prev = NULL;
1751 1751                                  PP_SETBOOTPAGES(pp);
1752 1752                                  if (bootpages != NULL) {
1753 1753                                          bootpages->p_prev = pp;
1754 1754                                  }
1755 1755                                  bootpages = pp;
1756 1756                                  ++boot_protect_cnt;
1757 1757                          }
1758 1758  
1759 1759                          ++pfn;
1760 1760                          len -= MMU_PAGESIZE;
1761 1761                          va += MMU_PAGESIZE;
1762 1762                  }
1763 1763          }
1764 1764          PRM_DEBUG(boot_protect_cnt);
1765 1765  }
1766 1766  
1767 1767  /*
1768 1768   *
1769 1769   */
1770 1770  static void
1771 1771  layout_kernel_va(void)
1772 1772  {
1773 1773          PRM_POINT("layout_kernel_va() starting...");
1774 1774          /*
1775 1775           * Establish the final size of the kernel's heap, size of segmap,
1776 1776           * segkp, etc.
1777 1777           */
1778 1778  
1779 1779  #if defined(__amd64)
1780 1780  
1781 1781          kpm_vbase = (caddr_t)segkpm_base;
1782 1782          if (physmax + 1 < plat_dr_physmax) {
1783 1783                  kpm_size = ROUND_UP_LPAGE(mmu_ptob(plat_dr_physmax));
1784 1784          } else {
1785 1785                  kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1));
1786 1786          }
1787 1787          if ((uintptr_t)kpm_vbase + kpm_size > (uintptr_t)valloc_base)
1788 1788                  panic("not enough room for kpm!");
1789 1789          PRM_DEBUG(kpm_size);
1790 1790          PRM_DEBUG(kpm_vbase);
1791 1791  
1792 1792          /*
1793 1793           * By default we create a seg_kp in 64 bit kernels, it's a little
1794 1794           * faster to access than embedding it in the heap.
1795 1795           */
1796 1796          segkp_base = (caddr_t)valloc_base + valloc_sz;
1797 1797          if (!segkp_fromheap) {
1798 1798                  size_t sz = mmu_ptob(segkpsize);
1799 1799  
1800 1800                  /*
1801 1801                   * determine size of segkp
1802 1802                   */
1803 1803                  if (sz < SEGKPMINSIZE || sz > SEGKPMAXSIZE) {
1804 1804                          sz = SEGKPDEFSIZE;
1805 1805                          cmn_err(CE_WARN, "!Illegal value for segkpsize. "
1806 1806                              "segkpsize has been reset to %ld pages",
1807 1807                              mmu_btop(sz));
1808 1808                  }
1809 1809                  sz = MIN(sz, MAX(SEGKPMINSIZE, mmu_ptob(physmem)));
1810 1810  
1811 1811                  segkpsize = mmu_btop(ROUND_UP_LPAGE(sz));
1812 1812          }
1813 1813          PRM_DEBUG(segkp_base);
1814 1814          PRM_DEBUG(segkpsize);
1815 1815  
1816 1816          /*
1817 1817           * segzio is used for ZFS cached data. It uses a distinct VA
1818 1818           * segment (from kernel heap) so that we can easily tell not to
1819 1819           * include it in kernel crash dumps on 64 bit kernels. The trick is
1820 1820           * to give it lots of VA, but not constrain the kernel heap.
1821 1821           * We scale the size of segzio linearly with physmem up to
1822 1822           * SEGZIOMAXSIZE. Above that amount it scales at 50% of physmem.
1823 1823           */
1824 1824          segzio_base = segkp_base + mmu_ptob(segkpsize);
1825 1825          if (segzio_fromheap) {
1826 1826                  segziosize = 0;
1827 1827          } else {
1828 1828                  size_t physmem_size = mmu_ptob(physmem);
1829 1829                  size_t size = (segziosize == 0) ?
1830 1830                      physmem_size : mmu_ptob(segziosize);
1831 1831  
1832 1832                  if (size < SEGZIOMINSIZE)
1833 1833                          size = SEGZIOMINSIZE;
1834 1834                  if (size > SEGZIOMAXSIZE) {
1835 1835                          size = SEGZIOMAXSIZE;
1836 1836                          if (physmem_size > size)
1837 1837                                  size += (physmem_size - size) / 2;
1838 1838                  }
1839 1839                  segziosize = mmu_btop(ROUND_UP_LPAGE(size));
1840 1840          }
1841 1841          PRM_DEBUG(segziosize);
1842 1842          PRM_DEBUG(segzio_base);
1843 1843  
1844 1844          /*
1845 1845           * Put the range of VA for device mappings next, kmdb knows to not
1846 1846           * grep in this range of addresses.
1847 1847           */
1848 1848          toxic_addr =
1849 1849              ROUND_UP_LPAGE((uintptr_t)segzio_base + mmu_ptob(segziosize));
1850 1850          PRM_DEBUG(toxic_addr);
1851 1851          segmap_start = ROUND_UP_LPAGE(toxic_addr + toxic_size);
1852 1852  #else /* __i386 */
1853 1853          segmap_start = ROUND_UP_LPAGE(kernelbase);
1854 1854  #endif /* __i386 */
1855 1855          PRM_DEBUG(segmap_start);
1856 1856  
1857 1857          /*
1858 1858           * Users can change segmapsize through eeprom. If the variable
1859 1859           * is tuned through eeprom, there is no upper bound on the
1860 1860           * size of segmap.
1861 1861           */
1862 1862          segmapsize = MAX(ROUND_UP_LPAGE(segmapsize), SEGMAPDEFAULT);
1863 1863  
1864 1864  #if defined(__i386)
1865 1865          /*
1866 1866           * 32-bit systems don't have segkpm or segkp, so segmap appears at
1867 1867           * the bottom of the kernel's address range.  Set aside space for a
1868 1868           * small red zone just below the start of segmap.
1869 1869           */
1870 1870          segmap_start += KERNEL_REDZONE_SIZE;
1871 1871          segmapsize -= KERNEL_REDZONE_SIZE;
1872 1872  #endif
1873 1873  
1874 1874          PRM_DEBUG(segmap_start);
1875 1875          PRM_DEBUG(segmapsize);
1876 1876          kernelheap = (caddr_t)ROUND_UP_LPAGE(segmap_start + segmapsize);
1877 1877          PRM_DEBUG(kernelheap);
1878 1878          PRM_POINT("layout_kernel_va() done...");
1879 1879  }
1880 1880  
1881 1881  /*
1882 1882   * Finish initializing the VM system, now that we are no longer
1883 1883   * relying on the boot time memory allocators.
1884 1884   */
1885 1885  static void
1886 1886  startup_vm(void)
1887 1887  {
1888 1888          struct segmap_crargs a;
1889 1889  
1890 1890          extern int use_brk_lpg, use_stk_lpg;
1891 1891  
1892 1892          PRM_POINT("startup_vm() starting...");
1893 1893  
1894 1894          /*
1895 1895           * Initialize the hat layer.
1896 1896           */
1897 1897          hat_init();
1898 1898  
1899 1899          /*
1900 1900           * Do final allocations of HAT data structures that need to
1901 1901           * be allocated before quiescing the boot loader.
1902 1902           */
1903 1903          PRM_POINT("Calling hat_kern_alloc()...");
1904 1904          hat_kern_alloc((caddr_t)segmap_start, segmapsize, ekernelheap);
1905 1905          PRM_POINT("hat_kern_alloc() done");
1906 1906  
1907 1907  #ifndef __xpv
1908 1908          /*
1909 1909           * Setup Page Attribute Table
1910 1910           */
1911 1911          pat_sync();
1912 1912  #endif
1913 1913  
1914 1914          /*
1915 1915           * The next two loops are done in distinct steps in order
1916 1916           * to be sure that any page that is doubly mapped (both above
1917 1917           * KERNEL_TEXT and below kernelbase) is dealt with correctly.
1918 1918           * Note this may never happen, but it might someday.
1919 1919           */
1920 1920          bootpages = NULL;
1921 1921          PRM_POINT("Protecting boot pages");
1922 1922  
1923 1923          /*
1924 1924           * Protect any pages mapped above KERNEL_TEXT that somehow have
1925 1925           * page_t's. This can only happen if something weird allocated
1926 1926           * in this range (like kadb/kmdb).
1927 1927           */
1928 1928          protect_boot_range(KERNEL_TEXT, (uintptr_t)-1, 0);
1929 1929  
1930 1930          /*
1931 1931           * Before we can take over memory allocation/mapping from the boot
1932 1932           * loader we must remove from our free page lists any boot allocated
1933 1933           * pages that stay mapped until release_bootstrap().
1934 1934           */
1935 1935          protect_boot_range(0, kernelbase, 1);
1936 1936  
1937 1937  
1938 1938          /*
1939 1939           * Switch to running on regular HAT (not boot_mmu)
1940 1940           */
1941 1941          PRM_POINT("Calling hat_kern_setup()...");
1942 1942          hat_kern_setup();
1943 1943  
1944 1944          /*
1945 1945           * It is no longer safe to call BOP_ALLOC(), so make sure we don't.
1946 1946           */
1947 1947          bop_no_more_mem();
1948 1948  
1949 1949          PRM_POINT("hat_kern_setup() done");
1950 1950  
1951 1951          hat_cpu_online(CPU);
1952 1952  
1953 1953          /*
1954 1954           * Initialize VM system
1955 1955           */
1956 1956          PRM_POINT("Calling kvm_init()...");
1957 1957          kvm_init();
1958 1958          PRM_POINT("kvm_init() done");
1959 1959  
1960 1960          /*
1961 1961           * Tell kmdb that the VM system is now working
1962 1962           */
1963 1963          if (boothowto & RB_DEBUG)
1964 1964                  kdi_dvec_vmready();
1965 1965  
1966 1966  #if defined(__xpv)
1967 1967          /*
1968 1968           * Populate the I/O pool on domain 0
1969 1969           */
1970 1970          if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1971 1971                  extern long populate_io_pool(void);
1972 1972                  long init_io_pool_cnt;
1973 1973  
1974 1974                  PRM_POINT("Populating reserve I/O page pool");
1975 1975                  init_io_pool_cnt = populate_io_pool();
1976 1976                  PRM_DEBUG(init_io_pool_cnt);
1977 1977          }
1978 1978  #endif
1979 1979          /*
1980 1980           * Mangle the brand string etc.
1981 1981           */
1982 1982          cpuid_pass3(CPU);
1983 1983  
1984 1984  #if defined(__amd64)
1985 1985  
1986 1986          /*
1987 1987           * Create the device arena for toxic (to dtrace/kmdb) mappings.
1988 1988           */
1989 1989          device_arena = vmem_create("device", (void *)toxic_addr,
1990 1990              toxic_size, MMU_PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP);
1991 1991  
1992 1992  #else   /* __i386 */
1993 1993  
1994 1994          /*
1995 1995           * allocate the bit map that tracks toxic pages
1996 1996           */
1997 1997          toxic_bit_map_len = btop((ulong_t)(valloc_base - kernelbase));
1998 1998          PRM_DEBUG(toxic_bit_map_len);
1999 1999          toxic_bit_map =
2000 2000              kmem_zalloc(BT_SIZEOFMAP(toxic_bit_map_len), KM_NOSLEEP);
2001 2001          ASSERT(toxic_bit_map != NULL);
2002 2002          PRM_DEBUG(toxic_bit_map);
2003 2003  
2004 2004  #endif  /* __i386 */
2005 2005  
2006 2006  
2007 2007          /*
2008 2008           * Now that we've got more VA, as well as the ability to allocate from
2009 2009           * it, tell the debugger.
2010 2010           */
2011 2011          if (boothowto & RB_DEBUG)
2012 2012                  kdi_dvec_memavail();
2013 2013  
2014 2014          /*
2015 2015           * The following code installs a special page fault handler (#pf)
2016 2016           * to work around a pentium bug.
2017 2017           */
2018 2018  #if !defined(__amd64) && !defined(__xpv)
2019 2019          if (x86_type == X86_TYPE_P5) {
2020 2020                  desctbr_t idtr;
2021 2021                  gate_desc_t *newidt;
2022 2022  
2023 2023                  if ((newidt = kmem_zalloc(MMU_PAGESIZE, KM_NOSLEEP)) == NULL)
2024 2024                          panic("failed to install pentium_pftrap");
2025 2025  
2026 2026                  bcopy(idt0, newidt, NIDT * sizeof (*idt0));
2027 2027                  set_gatesegd(&newidt[T_PGFLT], &pentium_pftrap,
2028 2028                      KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
2029 2029  
2030 2030                  (void) as_setprot(&kas, (caddr_t)newidt, MMU_PAGESIZE,
2031 2031                      PROT_READ | PROT_EXEC);
2032 2032  
2033 2033                  CPU->cpu_idt = newidt;
2034 2034                  idtr.dtr_base = (uintptr_t)CPU->cpu_idt;
2035 2035                  idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
2036 2036                  wr_idtr(&idtr);
2037 2037          }
2038 2038  #endif  /* !__amd64 */
2039 2039  
2040 2040  #if !defined(__xpv)
2041 2041          /*
2042 2042           * Map page pfn=0 for drivers, such as kd, that need to pick up
2043 2043           * parameters left there by controllers/BIOS.
2044 2044           */
2045 2045          PRM_POINT("setup up p0_va");
2046 2046          p0_va = i86devmap(0, 1, PROT_READ);
2047 2047          PRM_DEBUG(p0_va);
2048 2048  #endif
2049 2049  
2050 2050          cmn_err(CE_CONT, "?mem = %luK (0x%lx)\n",
2051 2051              physinstalled << (MMU_PAGESHIFT - 10), ptob(physinstalled));
2052 2052  
2053 2053          /*
2054 2054           * disable automatic large pages for small memory systems or
2055 2055           * when the disable flag is set.
2056 2056           *
2057 2057           * Do not yet consider page sizes larger than 2m/4m.
2058 2058           */
2059 2059          if (!auto_lpg_disable && mmu.max_page_level > 0) {
2060 2060                  max_uheap_lpsize = LEVEL_SIZE(1);
2061 2061                  max_ustack_lpsize = LEVEL_SIZE(1);
2062 2062                  max_privmap_lpsize = LEVEL_SIZE(1);
2063 2063                  max_uidata_lpsize = LEVEL_SIZE(1);
2064 2064                  max_utext_lpsize = LEVEL_SIZE(1);
2065 2065                  max_shm_lpsize = LEVEL_SIZE(1);
2066 2066          }
2067 2067          if (physmem < privm_lpg_min_physmem || mmu.max_page_level == 0 ||
2068 2068              auto_lpg_disable) {
2069 2069                  use_brk_lpg = 0;
2070 2070                  use_stk_lpg = 0;
2071 2071          }
2072 2072          mcntl0_lpsize = LEVEL_SIZE(mmu.umax_page_level);
2073 2073  
2074 2074          PRM_POINT("Calling hat_init_finish()...");
2075 2075          hat_init_finish();
2076 2076          PRM_POINT("hat_init_finish() done");
2077 2077  
2078 2078          /*
2079 2079           * Initialize the segkp segment type.
2080 2080           */
2081 2081          rw_enter(&kas.a_lock, RW_WRITER);
2082 2082          PRM_POINT("Attaching segkp");
2083 2083          if (segkp_fromheap) {
2084 2084                  segkp->s_as = &kas;
2085 2085          } else if (seg_attach(&kas, (caddr_t)segkp_base, mmu_ptob(segkpsize),
2086 2086              segkp) < 0) {
2087 2087                  panic("startup: cannot attach segkp");
2088 2088                  /*NOTREACHED*/
2089 2089          }
2090 2090          PRM_POINT("Doing segkp_create()");
2091 2091          if (segkp_create(segkp) != 0) {
2092 2092                  panic("startup: segkp_create failed");
2093 2093                  /*NOTREACHED*/
2094 2094          }
2095 2095          PRM_DEBUG(segkp);
2096 2096          rw_exit(&kas.a_lock);
2097 2097  
2098 2098          /*
2099 2099           * kpm segment
2100 2100           */
2101 2101          segmap_kpm = 0;
2102 2102          if (kpm_desired) {
2103 2103                  kpm_init();
2104 2104                  kpm_enable = 1;
2105 2105          }
2106 2106  
2107 2107          /*
2108 2108           * Now create segmap segment.
2109 2109           */
2110 2110          rw_enter(&kas.a_lock, RW_WRITER);
2111 2111          if (seg_attach(&kas, (caddr_t)segmap_start, segmapsize, segmap) < 0) {
2112 2112                  panic("cannot attach segmap");
2113 2113                  /*NOTREACHED*/
2114 2114          }
2115 2115          PRM_DEBUG(segmap);
2116 2116  
2117 2117          a.prot = PROT_READ | PROT_WRITE;
2118 2118          a.shmsize = 0;
2119 2119          a.nfreelist = segmapfreelists;
2120 2120  
2121 2121          if (segmap_create(segmap, (caddr_t)&a) != 0)
2122 2122                  panic("segmap_create segmap");
2123 2123          rw_exit(&kas.a_lock);
2124 2124  
2125 2125          setup_vaddr_for_ppcopy(CPU);
2126 2126  
2127 2127          segdev_init();
2128 2128  #if defined(__xpv)
2129 2129          if (DOMAIN_IS_INITDOMAIN(xen_info))
2130 2130  #endif
2131 2131                  pmem_init();
2132 2132  
2133 2133          PRM_POINT("startup_vm() done");
2134 2134  }
2135 2135  
/*
 * Load the named module from the "tod" module directory, for systems with
 * a non-standard time-of-day part.  The system is halted if modload()
 * reports failure (-1).
 */
static void
load_tod_module(char *todmod)
{
        int rv = modload("tod", todmod);

        if (rv == -1)
                halt("Can't load TOD module");
}
2145 2145  
2146 2146  static void
2147 2147  startup_end(void)
2148 2148  {
                   /*
                    * Final stage of startup.  In order: kern_setup1(), CPC and
                    * cpu event framework initialization, optional TOD module
                    * load, configure(), XSAVE MSR setup, ISA list and interrupt
                    * thread setup, psm_install(); then bootops is cleared,
                    * interrupts are enabled, the soft interrupt handlers are
                    * registered and post_startup_cpu_fixups() is called.
                    */
2149 2149          int i;
2150 2150          extern void setx86isalist(void);
2151 2151          extern void cpu_event_init(void);
2152 2152  
2153 2153          PRM_POINT("startup_end() starting...");
2154 2154  
2155 2155          /*
2156 2156           * Perform tasks that get done after most of the VM
2157 2157           * initialization has been done but before the clock
2158 2158           * and other devices get started.
2159 2159           */
2160 2160          kern_setup1();
2161 2161  
2162 2162          /*
2163 2163           * Perform CPC initialization for this CPU.
2164 2164           */
2165 2165          kcpc_hw_init(CPU);
2166 2166  
2167 2167          /*
2168 2168           * Initialize cpu event framework.
2169 2169           */
2170 2170          cpu_event_init();
2171 2171  
2172 2172  #if defined(OPTERON_WORKAROUND_6323525)
2173 2173          if (opteron_workaround_6323525)
2174 2174                  patch_workaround_6323525();
2175 2175  #endif
2176 2176          /*
2177 2177           * If needed, load TOD module now so that ddi_get_time(9F) etc. work
2178 2178           * (For now, "needed" is defined as set tod_module_name in /etc/system)
2179 2179           */
2180 2180          if (tod_module_name != NULL) {
2181 2181                  PRM_POINT("load_tod_module()");
2182 2182                  load_tod_module(tod_module_name);
2183 2183          }
2184 2184  
2185 2185  #if defined(__xpv)
2186 2186          /*
2187 2187           * Forceload interposing TOD module for the hypervisor.
2188 2188           */
2189 2189          PRM_POINT("load_tod_module()");
2190 2190          load_tod_module("xpvtod");
2191 2191  #endif
2192 2192  
2193 2193          /*
2194 2194           * Configure the system.
2195 2195           */
2196 2196          PRM_POINT("Calling configure()...");
2197 2197          configure();            /* set up devices */
2198 2198          PRM_POINT("configure() done");
2199 2199  
2200 2200          /*
2201 2201           * We can now setup for XSAVE because fpu_probe is done in configure().
2202 2202           */
2203 2203          if (fp_save_mech == FP_XSAVE) {
2204 2204                  xsave_setup_msr(CPU);
2205 2205          }
2206 2206  
2207 2207          /*
2208 2208           * Set the isa_list string to the defined instruction sets we
2209 2209           * support.
2210 2210           */
2211 2211          setx86isalist();
2212 2212          cpu_intr_alloc(CPU, NINTR_THREADS);
2213 2213          psm_install();
2214 2214  
2215 2215          /*
2216 2216           * We're done with bootops.  We don't unmap the bootstrap yet because
2217 2217           * we're still using bootsvcs.
2218 2218           */
2219 2219          PRM_POINT("NULLing out bootops");
2220 2220          *bootopsp = (struct bootops *)NULL;
2221 2221          bootops = (struct bootops *)NULL;
2222 2222  
2223 2223  #if defined(__xpv)
2224 2224          ec_init_debug_irq();
2225 2225          xs_domu_init();
2226 2226  #endif
2227 2227  
2228 2228  #if defined(__amd64) && !defined(__xpv)
2229 2229          /*
2230 2230           * Intel IOMMU has been setup/initialized in ddi_impl.c
2231 2231           * Start it up now.
2232 2232           */
2233 2233          immu_startup();
2234 2234  #endif
2235 2235  
2236 2236          PRM_POINT("Enabling interrupts");
2237 2237          (*picinitf)();

↓ open down ↓

2237 lines elided

↑ open up ↑

2238 2238          sti();
2239 2239  #if defined(__xpv)
2240 2240          ASSERT(CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0);
2241 2241          xen_late_startup();
2242 2242  #endif
2243 2243  
2244 2244          (void) add_avsoftintr((void *)&softlevel1_hdl, 1, softlevel1,
2245 2245              "softlevel1", NULL, NULL); /* XXX to be moved later */
2246 2246  
2247 2247          /*
2248      -         * Register these software interrupts for ddi timer.
     2248 +         * Register software interrupt handlers for ddi_periodic_add(9F).
2249 2249           * Software interrupts up to the level 10 are supported.
2250 2250           */
                   /*
                    * The handle for level i lives in softlevel_hdl[i - 1]; the
                    * level i itself is passed both as the registration level and
                    * as the handler argument.
                    */
2251 2251          for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
2252      -                char name[sizeof ("timer_softintr") + 2];
2253      -                (void) sprintf(name, "timer_softintr%02d", i);
     2252 +                /*
     2253 +                 * The constant string "ddi_periodic", below, is only used to
     2254 +                 * print debugging information.  Generating a dynamic string
     2255 +                 * for each soft level is a waste of kernel memory.
     2256 +                 */
2254 2257                  (void) add_avsoftintr((void *)&softlevel_hdl[i-1], i,
2255      -                    (avfunc)timer_softintr, name, (caddr_t)(uintptr_t)i, NULL);
     2258 +                    (avfunc)ddi_periodic_softintr, "ddi_periodic",
     2259 +                    (caddr_t)(uintptr_t)i, NULL);
2256 2260          }
2257 2261  
2258 2262  #if !defined(__xpv)
                   /*
                    * Neither failure below is fatal: a failed modload() is taken
                    * to mean that no AMD IOMMU is present, and a failed attach is
                    * only reported through prom_printf().
                    */
2259 2263          if (modload("drv", "amd_iommu") < 0) {
2260 2264                  PRM_POINT("No AMD IOMMU present\n");
2261 2265          } else if (ddi_hold_installed_driver(ddi_name_to_major(
2262 2266              "amd_iommu")) == NULL) {
2263 2267                  prom_printf("ERROR: failed to attach AMD IOMMU\n");
2264 2268          }
2265 2269  #endif

2266 2270          post_startup_cpu_fixups();
2267 2271  
2268 2272          PRM_POINT("startup_end() done");
2269 2273  }
2270 2274  
2271 2275  /*
2272 2276   * Don't remove the following 2 variables.  They are necessary
2273 2277   * for reading the hostid from the legacy file (/kernel/misc/sysinit).
            *
            * _hs1107 points at hw_serial, the string set_soft_hostid() converts
            * with atoi() after loading the "sysinit" module.
            * NOTE(review): _bdhs34 is not referenced in the visible part of this
            * file; presumably the legacy sysinit module uses it -- confirm.
2274 2278   */
2275 2279  char *_hs1107 = hw_serial;
2276 2280  ulong_t  _bdhs34;
2277 2281  
2278 2282  void
2279 2283  post_startup(void)
2280 2284  {
                   /*
                    * Remaining startup work, performed in the order written:
                    * bind the hardware capability flags, start xpv panic support
                    * or the memory scrubber, finish CPU module initialization,
                    * process /etc/system forceloads, load procfs, attach pit_beep,
                    * verify FP hardware (i386 only), record maxmem, set up the
                    * per-CPU event/power-management state and device node for the
                    * current CPU, and finally call pg_init().
                    */
2281 2285          extern void cpupm_init(cpu_t *);
2282 2286          extern void cpu_event_init_cpu(cpu_t *);
2283 2287  
2284 2288          /*
2285 2289           * Set the system wide, processor-specific flags to be passed
2286 2290           * to userland via the aux vector for performance hints and
2287 2291           * instruction set extensions.
2288 2292           */
2289 2293          bind_hwcap();
2290 2294  
                   /*
                    * When __xpv is defined the braces below form the body of the
                    * DOMAIN_IS_INITDOMAIN() test; otherwise the test is compiled
                    * out and the block always executes.
                    */
2291 2295  #ifdef __xpv
2292 2296          if (DOMAIN_IS_INITDOMAIN(xen_info))
2293 2297  #endif
2294 2298          {
2295 2299  #if defined(__xpv)
2296 2300                  xpv_panic_init();
2297 2301  #else
2298 2302                  /*
2299 2303                   * Startup the memory scrubber.
2300 2304                   * XXPV This should be running somewhere ..
2301 2305                   */
2302 2306                  if ((get_hwenv() & HW_VIRTUAL) == 0)
2303 2307                          memscrub_init();
2304 2308  #endif
2305 2309          }
2306 2310  
2307 2311          /*
2308 2312           * Complete CPU module initialization
2309 2313           */
2310 2314          cmi_post_startup();
2311 2315  
2312 2316          /*
2313 2317           * Perform forceloading tasks for /etc/system.
2314 2318           */
2315 2319          (void) mod_sysctl(SYS_FORCELOAD, NULL);
2316 2320  
2317 2321          /*
2318 2322           * ON4.0: Force /proc module in until clock interrupt handle fixed
2319 2323           * ON4.0: This must be fixed or restated in /etc/systems.
2320 2324           */
2321 2325          (void) modload("fs", "procfs");
2322 2326  
2323 2327          (void) i_ddi_attach_hw_nodes("pit_beep");
2324 2328  
2325 2329  #if defined(__i386)
2326 2330          /*
2327 2331           * Check for required functional Floating Point hardware,
2328 2332           * unless FP hardware explicitly disabled.
2329 2333           */
2330 2334          if (fpu_exists && (fpu_pentium_fdivbug || fp_kind == FP_NO))
2331 2335                  halt("No working FP hardware found");
2332 2336  #endif
2333 2337  
                   /* record the amount of free memory at this point as maxmem */
2334 2338          maxmem = freemem;
2335 2339  
2336 2340          cpu_event_init_cpu(CPU);
2337 2341          cpupm_init(CPU);
2338 2342          (void) mach_cpu_create_device_node(CPU, NULL);
2339 2343  
2340 2344          pg_init();
2341 2345  }
2342 2346  
2343 2347  static int
2344 2348  pp_in_range(page_t *pp, uint64_t low_addr, uint64_t high_addr)
2345 2349  {
2346 2350          return ((pp->p_pagenum >= btop(low_addr)) &&
2347 2351              (pp->p_pagenum < btopr(high_addr)));
2348 2352  }
2349 2353  
2350 2354  void
2351 2355  release_bootstrap(void)
2352 2356  {
                   /*
                    * Give back what was held on behalf of the boot loader: unmount
                    * the boot ramdisk root, clear the boot mappings below
                    * _userlimit, remove the hardcoded ramdisk node when root is
                    * not on ramdisk, and free every page on the bootpages list
                    * except those moved to lower_pages or rd_pages.  When __xpv
                    * is not defined, one page below 1MB is then claimed for
                    * rm_platter_va/rm_platter_pa.
                    */
2353 2357          int root_is_ramdisk;
2354 2358          page_t *pp;
2355 2359          extern void kobj_boot_unmountroot(void);
2356 2360          extern dev_t rootdev;
2357 2361  #if !defined(__xpv)
2358 2362          pfn_t   pfn;
2359 2363  #endif
2360 2364  
2361 2365          /* unmount boot ramdisk and release kmem usage */
2362 2366          kobj_boot_unmountroot();
2363 2367  
2364 2368          /*
2365 2369           * We're finished using the boot loader so free its pages.
2366 2370           */
2367 2371          PRM_POINT("Unmapping lower boot pages");
2368 2372  
2369 2373          clear_boot_mappings(0, _userlimit);
2370 2374  
2371 2375          postbootkernelbase = kernelbase;
2372 2376  
2373 2377          /*
2374 2378           * If root isn't on ramdisk, destroy the hardcoded
2375 2379           * ramdisk node now and release the memory. Else,
2376 2380           * ramdisk memory is kept in rd_pages.
2377 2381           */
2378 2382          root_is_ramdisk = (getmajor(rootdev) == ddi_name_to_major("ramdisk"));
2379 2383          if (!root_is_ramdisk) {
2380 2384                  dev_info_t *dip = ddi_find_devinfo("ramdisk", -1, 0);
                           /*
                            * NOTE(review): dip is validated only by the ASSERT; if
                            * ASSERT compiles to nothing, a NULL dip would reach the
                            * calls below.  Presumably the hardcoded node always
                            * exists at this point -- confirm.
                            */
2381 2385                  ASSERT(dip && ddi_get_parent(dip) == ddi_root_node());
2382 2386                  ndi_rele_devi(dip);     /* held from ddi_find_devinfo */
2383 2387                  (void) ddi_remove_child(dip, 0);
2384 2388          }
2385 2389  
2386 2390          PRM_POINT("Releasing boot pages");
2387 2391          while (bootpages) {
2388 2392                  extern uint64_t ramdisk_start, ramdisk_end;
2389 2393                  pp = bootpages;
2390 2394                  bootpages = pp->p_next;
2391 2395  
2392 2396  
2393 2397                  /* Keep pages below 0x40000 (the lower 256K) */
2394 2398                  if (pp_in_range(pp, 0, 0x40000)) {
2395 2399                          pp->p_next = lower_pages;
2396 2400                          lower_pages = pp;
2397 2401                          lower_pages_count++;
2398 2402                          continue;
2399 2403                  }
2400 2404  
2401 2405  
                           /* ramdisk pages stay allocated while root lives on them */
2402 2406                  if (root_is_ramdisk && pp_in_range(pp, ramdisk_start,
2403 2407                      ramdisk_end)) {
2404 2408                          pp->p_next = rd_pages;
2405 2409                          rd_pages = pp;
2406 2410                          continue;
2407 2411                  }
2408 2412                  pp->p_next = (struct page *)0;
2409 2413                  pp->p_prev = (struct page *)0;
2410 2414                  PP_CLRBOOTPAGES(pp);
2411 2415                  page_free(pp, 1);
2412 2416          }
2413 2417          PRM_POINT("Boot pages released");
2414 2418  
2415 2419  #if !defined(__xpv)
2416 2420  /* XXPV -- note this following bunch of code needs to be revisited in Xen 3.0 */
2417 2421          /*
2418 2422           * Find 1 page below 1 MB so that other processors can boot up or
2419 2423           * so that any processor can resume.
2420 2424           * Make sure it has a kernel VA as well as a 1:1 mapping.
2421 2425           * We should have just free'd one up.
2422 2426           */
2423 2427  
2424 2428          /*
2425 2429           * 0x10 pages is 64K.  Leave the bottom 64K alone
2426 2430           * for BIOS.
2427 2431           */
2428 2432          for (pfn = 0x10; pfn < btop(1*1024*1024); pfn++) {
2429 2433                  if (page_numtopp_alloc(pfn) == NULL)
2430 2434                          continue;
2431 2435                  rm_platter_va = i86devmap(pfn, 1,
2432 2436                      PROT_READ | PROT_WRITE | PROT_EXEC);
2433 2437                  rm_platter_pa = ptob(pfn);
2434 2438                  break;
2435 2439          }
                   /* pfn reaches the 1MB limit only if no page could be allocated */
2436 2440          if (pfn == btop(1*1024*1024) && use_mp)
2437 2441                  panic("No page below 1M available for starting "
2438 2442                      "other processors or for resuming from system-suspend");
2439 2443  #endif  /* !__xpv */
2440 2444  }
2441 2445  
2442 2446  /*
2443 2447   * Initialize the platform-specific parts of a page_t.
            *
            *   pp    page structure being initialized
            *   pnum  page frame number to record in pp->p_pagenum
            *
            * Besides recording the frame number, the mapping pointer and the
            * p_embed, p_share and p_mlentry fields are cleared.
2444 2448   */
2445 2449  void
2446 2450  add_physmem_cb(page_t *pp, pfn_t pnum)
2447 2451  {
2448 2452          pp->p_pagenum = pnum;
2449 2453          pp->p_mapping = NULL;
2450 2454          pp->p_embed = 0;
2451 2455          pp->p_share = 0;
2452 2456          pp->p_mlentry = 0;
2453 2457  }
2454 2458  
2455 2459  /*
2456 2460   * kphysm_init() initializes physical memory.
            *
            * Walks the phys_avail memlist and, for each usable range, fills in the
            * next memseg (starting at memseg_base), links it at the head of the
            * memsegs list and hands the range to add_physmem().  availrmem and
            * availrmem_initial are increased by the pages added, and
            * build_pfn_hash() is called once all ranges have been processed.
            *
            *   pp      first page_t to use for the pages being added
            *   npages  upper bound on the number of pages to process
            *
            * Returns pages_done, the number of pages counted against npages.
2457 2461   */
2458 2462  static pgcnt_t
2459 2463  kphysm_init(
2460 2464          page_t *pp,
2461 2465          pgcnt_t npages)
2462 2466  {
2463 2467          struct memlist  *pmem;
2464 2468          struct memseg   *cur_memseg;
2465 2469          pfn_t           base_pfn;
2466 2470          pfn_t           end_pfn;
2467 2471          pgcnt_t         num;
2468 2472          pgcnt_t         pages_done = 0;
2469 2473          uint64_t        addr;
2470 2474          uint64_t        size;
2471 2475          extern pfn_t    ddiphysmin;
2472 2476          extern int      mnode_xwa;
2473 2477          int             ms = 0, me = 0;
2474 2478  
2475 2479          ASSERT(page_hash != NULL && page_hashsz != 0);
2476 2480  
2477 2481          cur_memseg = memseg_base;
2478 2482          for (pmem = phys_avail; pmem && npages; pmem = pmem->ml_next) {
2479 2483                  /*
2480 2484                   * In a 32 bit kernel can't use higher memory if we're
2481 2485                   * not booting in PAE mode. This check takes care of that.
2482 2486                   */
2483 2487                  addr = pmem->ml_address;
2484 2488                  size = pmem->ml_size;
2485 2489                  if (btop(addr) > physmax)
2486 2490                          continue;
2487 2491  
2488 2492                  /*
2489 2493                   * align addr and size - they may not be at page boundaries
2490 2494                   */
2491 2495                  if ((addr & MMU_PAGEOFFSET) != 0) {
2492 2496                          addr += MMU_PAGEOFFSET;
2493 2497                          addr &= ~(uint64_t)MMU_PAGEOFFSET;
2494 2498                          size -= addr - pmem->ml_address;
2495 2499                  }
2496 2500  
2497 2501                  /* only process pages below or equal to physmax */
2498 2502                  if ((btop(addr + size) - 1) > physmax)
2499 2503                          size = ptob(physmax - btop(addr) + 1);
2500 2504  
2501 2505                  num = btop(size);
2502 2506                  if (num == 0)
2503 2507                          continue;
2504 2508  
                           /* never take more pages than the caller asked for */
2505 2509                  if (num > npages)
2506 2510                          num = npages;
2507 2511  
2508 2512                  npages -= num;
2509 2513                  pages_done += num;
2510 2514                  base_pfn = btop(addr);
2511 2515  
2512 2516                  if (prom_debug)
2513 2517                          prom_printf("MEMSEG addr=0x%" PRIx64
2514 2518                              " pgs=0x%lx pfn 0x%lx-0x%lx\n",
2515 2519                              addr, num, base_pfn, base_pfn + num);
2516 2520  
2517 2521                  /*
2518 2522                   * Ignore pages below ddiphysmin to simplify ddi memory
2519 2523                   * allocation with non-zero addr_lo requests.
                            *
                            * NOTE(review): npages and pages_done have already been
                            * adjusted by the full num at this point, and the
                            * whole-range skip below does not advance pp while the
                            * partial skip does -- confirm this accounting is
                            * intended.
2520 2524                   */
2521 2525                  if (base_pfn < ddiphysmin) {
2522 2526                          if (base_pfn + num <= ddiphysmin)
2523 2527                                  continue;
2524 2528                          pp += (ddiphysmin - base_pfn);
2525 2529                          num -= (ddiphysmin - base_pfn);
2526 2530                          base_pfn = ddiphysmin;
2527 2531                  }
2528 2532  
2529 2533                  /*
2530 2534                   * mnode_xwa is greater than 1 when large pages regions can
2531 2535                   * cross memory node boundaries. To prevent the formation
2532 2536                   * of these large pages, configure the memsegs based on the
2533 2537                   * memory node ranges which had been made non-contiguous.
2534 2538                   */
2535 2539                  if (mnode_xwa > 1) {
2536 2540  
2537 2541                          end_pfn = base_pfn + num - 1;
2538 2542                          ms = PFN_2_MEM_NODE(base_pfn);
2539 2543                          me = PFN_2_MEM_NODE(end_pfn);
2540 2544  
2541 2545                          if (ms != me) {
2542 2546                                  /*
2543 2547                                   * current range spans more than 1 memory node.
2544 2548                                   * Set num to only the pfn range in the start
2545 2549                                   * memory node.
2546 2550                                   */
2547 2551                                  num = mem_node_config[ms].physmax - base_pfn
2548 2552                                      + 1;
2549 2553                                  ASSERT(end_pfn > mem_node_config[ms].physmax);
2550 2554                          }
2551 2555                  }
2552 2556  
                           /*
                            * One memseg is built per pass.  ms and me start at 0 and
                            * are only reassigned in the mnode_xwa > 1 case above, so
                            * when that case is not taken the body runs exactly once.
                            */
2553 2557                  for (;;) {
2554 2558                          /*
2555 2559                           * Build the memsegs entry
2556 2560                           */
2557 2561                          cur_memseg->pages = pp;
2558 2562                          cur_memseg->epages = pp + num;
2559 2563                          cur_memseg->pages_base = base_pfn;
2560 2564                          cur_memseg->pages_end = base_pfn + num;
2561 2565  
2562 2566                          /*
2563 2567                           * Insert into memseg list in decreasing pfn range
2564 2568                           * order. Low memory is typically more fragmented such
2565 2569                           * that this ordering keeps the larger ranges at the
2566 2570                           * front of the list for code that searches memseg.
2567 2571                           * This ASSERTS that the memsegs coming in from boot
2568 2572                           * are in increasing physical address order and not
2569 2573                           * contiguous.
2570 2574                           */
2571 2575                          if (memsegs != NULL) {
2572 2576                                  ASSERT(cur_memseg->pages_base >=
2573 2577                                      memsegs->pages_end);
2574 2578                                  cur_memseg->next = memsegs;
2575 2579                          }
2576 2580                          memsegs = cur_memseg;
2577 2581  
2578 2582                          /*
2579 2583                           * add_physmem() initializes the PSM part of the page
2580 2584                           * struct by calling the PSM back with add_physmem_cb().
2581 2585                           * In addition it coalesces pages into larger pages as
2582 2586                           * it initializes them.
2583 2587                           */
2584 2588                          add_physmem(pp, num, base_pfn);
2585 2589                          cur_memseg++;
2586 2590                          availrmem_initial += num;
2587 2591                          availrmem += num;
2588 2592  
2589 2593                          pp += num;
                                   /* done once the last memory node of the range is handled */
2590 2594                          if (ms >= me)
2591 2595                                  break;
2592 2596  
2593 2597                          /* process next memory node range */
2594 2598                          ms++;
2595 2599                          base_pfn = mem_node_config[ms].physbase;
2596 2600                          num = MIN(mem_node_config[ms].physmax,
2597 2601                              end_pfn) - base_pfn + 1;
2598 2602                  }
2599 2603          }
2600 2604  
2601 2605          PRM_DEBUG(availrmem_initial);
2602 2606          PRM_DEBUG(availrmem);
2603 2607          PRM_DEBUG(freemem);
2604 2608          build_pfn_hash();
2605 2609          return (pages_done);
2606 2610  }
2607 2611  
2608 2612  /*
2609 2613   * Kernel VM initialization.
            *
            * With kas.a_lock held as writer, attaches and creates the kernel
            * segments in kas: ktextseg (s_text up to e_moddata), kvalloc, kvseg
            * (the kernel heap), kvseg_core and kzioseg when their sizes are
            * non-zero, and kdebugseg.  After dropping the lock it sets the
            * protections on the red zone at kernelbase, on the text range and on
            * the data range.
2610 2614   */
2611 2615  static void
2612 2616  kvm_init(void)
2613 2617  {
                   /* the kernel text must begin on a page boundary */
2614 2618          ASSERT((((uintptr_t)s_text) & MMU_PAGEOFFSET) == 0);
2615 2619  
2616 2620          /*
2617 2621           * Put the kernel segments in kernel address space.
2618 2622           */
2619 2623          rw_enter(&kas.a_lock, RW_WRITER);
2620 2624          as_avlinit(&kas);
2621 2625  
2622 2626          (void) seg_attach(&kas, s_text, e_moddata - s_text, &ktextseg);
2623 2627          (void) segkmem_create(&ktextseg);
2624 2628  
2625 2629          (void) seg_attach(&kas, (caddr_t)valloc_base, valloc_sz, &kvalloc);
2626 2630          (void) segkmem_create(&kvalloc);
2627 2631  
2628 2632          (void) seg_attach(&kas, kernelheap,
2629 2633              ekernelheap - kernelheap, &kvseg);
2630 2634          (void) segkmem_create(&kvseg);
2631 2635  
2632 2636          if (core_size > 0) {
2633 2637                  PRM_POINT("attaching kvseg_core");
2634 2638                  (void) seg_attach(&kas, (caddr_t)core_base, core_size,
2635 2639                      &kvseg_core);
2636 2640                  (void) segkmem_create(&kvseg_core);
2637 2641          }
2638 2642  
2639 2643          if (segziosize > 0) {
2640 2644                  PRM_POINT("attaching segzio");
2641 2645                  (void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
2642 2646                      &kzioseg);
2643 2647                  (void) segkmem_zio_create(&kzioseg);
2644 2648  
2645 2649                  /* create zio area covering new segment */
2646 2650                  segkmem_zio_init(segzio_base, mmu_ptob(segziosize));
2647 2651          }
2648 2652  
2649 2653          (void) seg_attach(&kas, kdi_segdebugbase, kdi_segdebugsize, &kdebugseg);
2650 2654          (void) segkmem_create(&kdebugseg);
2651 2655  
2652 2656          rw_exit(&kas.a_lock);
2653 2657  
2654 2658          /*
2655 2659           * Ensure that the red zone at kernelbase is never accessible.
2656 2660           */
2657 2661          PRM_POINT("protecting redzone");
2658 2662          (void) as_setprot(&kas, (caddr_t)kernelbase, KERNEL_REDZONE_SIZE, 0);
2659 2663  
2660 2664          /*
2661 2665           * Make the text writable so that it can be hot patched by DTrace.
2662 2666           */
2663 2667          (void) as_setprot(&kas, s_text, e_modtext - s_text,
2664 2668              PROT_READ | PROT_WRITE | PROT_EXEC);
2665 2669  
2666 2670          /*
2667 2671           * Make data writable until end.
2668 2672           */
2669 2673          (void) as_setprot(&kas, s_data, e_moddata - s_data,
2670 2674              PROT_READ | PROT_WRITE | PROT_EXEC);
2671 2675  }
2672 2676  
2673 2677  #ifndef __xpv
2674 2678  /*
2675 2679   * Solaris adds an entry for Write Combining caching to the PAT
2676 2680   */
2677 2681  static uint64_t pat_attr_reg = PAT_DEFAULT_ATTRIBUTE;
2678 2682  
2679 2683  void
2680 2684  pat_sync(void)
2681 2685  {
2682 2686          ulong_t cr0, cr0_orig, cr4;
2683 2687  
2684 2688          if (!is_x86_feature(x86_featureset, X86FSET_PAT))
2685 2689                  return;
2686 2690          cr0_orig = cr0 = getcr0();
2687 2691          cr4 = getcr4();
2688 2692  
2689 2693          /* disable caching and flush all caches and TLBs */
2690 2694          cr0 |= CR0_CD;
2691 2695          cr0 &= ~CR0_NW;
2692 2696          setcr0(cr0);
2693 2697          invalidate_cache();
2694 2698          if (cr4 & CR4_PGE) {
2695 2699                  setcr4(cr4 & ~(ulong_t)CR4_PGE);
2696 2700                  setcr4(cr4);
2697 2701          } else {
2698 2702                  reload_cr3();
2699 2703          }
2700 2704  
2701 2705          /* add our entry to the PAT */
2702 2706          wrmsr(REG_PAT, pat_attr_reg);
2703 2707  
2704 2708          /* flush TLBs and cache again, then reenable cr0 caching */
2705 2709          if (cr4 & CR4_PGE) {
2706 2710                  setcr4(cr4 & ~(ulong_t)CR4_PGE);
2707 2711                  setcr4(cr4);
2708 2712          } else {
2709 2713                  reload_cr3();
2710 2714          }
2711 2715          invalidate_cache();
2712 2716          setcr0(cr0_orig);
2713 2717  }
2714 2718  
2715 2719  #endif /* !__xpv */
2716 2720  
2717 2721  #if defined(_SOFT_HOSTID)
2718 2722  /*
2719 2723   * On platforms that do not have a hardware serial number, attempt
2720 2724   * to set one based on the contents of /etc/hostid.  If this file does
2721 2725   * not exist, assume that we are to generate a new hostid and set
2722 2726   * it in the kernel, for subsequent saving by a userland process
2723 2727   * once the system is up and the root filesystem is mounted r/w.
2724 2728   *
2725 2729   * In order to gracefully support upgrade on OpenSolaris, if
2726 2730   * /etc/hostid does not exist, we will attempt to get a serial number
2727 2731   * using the legacy method (/kernel/misc/sysinit).
2728 2732   *
2729 2733   * If that isn't present, we attempt to use an SMBIOS UUID, which is
2730 2734   * a hardware serial number.  Note that we don't automatically trust
2731 2735   * all SMBIOS UUIDs (some older platforms are defective and ship duplicate
2732 2736   * UUIDs in violation of the standard), we check against a blacklist.
2733 2737   *
2734 2738   * In an attempt to make the hostid less prone to abuse
2735 2739   * (for license circumvention, etc), we store it in /etc/hostid
2736 2740   * in rot47 format.
2737 2741   */
2738 2742  extern volatile unsigned long tenmicrodata;
2739 2743  static int atoi(char *);
2740 2744  
2741 2745  /*
2742 2746   * Set this to non-zero in /etc/system if you think your SMBIOS returns a
2743 2747   * UUID that is not unique. (Also report it so that the smbios_uuid_blacklist
2744 2748   * array can be updated.)
2745 2749   */
2746 2750  int smbios_broken_uuid = 0;
2747 2751  
2748 2752  /*
2749 2753   * List of known bad UUIDs.  Each entry is a full 16-byte UUID, compared
2750 2754   * byte-for-byte against the UUID reported by SMBIOS.  If your UUID is
2751 2755   * listed here, you need to contact your hardware OEM for a fix for your BIOS.
2752 2756   */
2753 2757  static unsigned char
2754 2758  smbios_uuid_blacklist[][16] = {
2755 2759  
2756 2760          {       /* Reported bad UUID (Google search) */
2757 2761                  0x00, 0x02, 0x00, 0x03, 0x00, 0x04, 0x00, 0x05,
2758 2762                  0x00, 0x06, 0x00, 0x07, 0x00, 0x08, 0x00, 0x09,
2759 2763          },
2760 2764          {       /* Known bad DELL UUID */
2761 2765                  0x4C, 0x4C, 0x45, 0x44, 0x00, 0x00, 0x20, 0x10,
2762 2766                  0x80, 0x20, 0x80, 0xC0, 0x4F, 0x20, 0x20, 0x20,
2763 2767          },
2764 2768          {       /* Uninitialized flash */
2765 2769                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
2766 2770                  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
2767 2771          },
2768 2772          {       /* All zeros */
2769 2773                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
2770 2774                  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
2771 2775          },
2772 2776  };
2773 2777  
2774 2778  static int32_t
2775 2779  uuid_to_hostid(const uint8_t *uuid)
2776 2780  {
2777 2781          /*
2778 2782           * Although the UUIDs are 128-bits, they may not distribute entropy
2779 2783           * evenly.  We would like to use SHA or MD5, but those are located
2780 2784           * in loadable modules and not available this early in boot.  As we
2781 2785           * don't need the values to be cryptographically strong, we just
2782 2786           * generate 32-bit vaue by xor'ing the various sequences together,
2783 2787           * which ensures that the enire UUID contributes to the hostid.
2784 2788           */
2785 2789          int32_t id = 0;
2786 2790  
2787 2791          /* first check against the blacklist */
2788 2792          for (int i = 0; i < (sizeof (smbios_uuid_blacklist) / 16); i++) {
2789 2793                  if (bcmp(smbios_uuid_blacklist[0], uuid, 16) == 0) {
2790 2794                          cmn_err(CE_CONT, "?Broken SMBIOS UUID. "
2791 2795                              "Contact BIOS manufacturer for repair.\n");
2792 2796                          return ((int32_t)HW_INVALID_HOSTID);
2793 2797                  }
2794 2798          }
2795 2799  
2796 2800          for (int i = 0; i < 16; i++)
2797 2801                  id ^= ((uuid[i]) << (8 * (i % sizeof (id))));
2798 2802  
2799 2803          return (id);
2800 2804  }
2801 2805  
2802 2806  static int32_t
2803 2807  set_soft_hostid(void)
2804 2808  {
2805 2809          struct _buf *file;
2806 2810          char tokbuf[MAXNAMELEN];
2807 2811          token_t token;
2808 2812          int done = 0;
2809 2813          u_longlong_t tmp;
2810 2814          int i;
2811 2815          int32_t hostid = (int32_t)HW_INVALID_HOSTID;
2812 2816          unsigned char *c;
2813 2817          hrtime_t tsc;
2814 2818          smbios_system_t smsys;
2815 2819  
2816 2820          /*
2817 2821           * If /etc/hostid file not found, we'd like to get a pseudo
2818 2822           * random number to use at the hostid.  A nice way to do this
2819 2823           * is to read the real time clock.  To remain xen-compatible,
2820 2824           * we can't poke the real hardware, so we use tsc_read() to
2821 2825           * read the real time clock.  However, there is an ominous
2822 2826           * warning in tsc_read that says it can return zero, so we
2823 2827           * deal with that possibility by falling back to using the
2824 2828           * (hopefully random enough) value in tenmicrodata.
2825 2829           */
2826 2830  
2827 2831          if ((file = kobj_open_file(hostid_file)) == (struct _buf *)-1) {
2828 2832                  /*
2829 2833                   * hostid file not found - try to load sysinit module
2830 2834                   * and see if it has a nonzero hostid value...use that
2831 2835                   * instead of generating a new hostid here if so.
2832 2836                   */
2833 2837                  if ((i = modload("misc", "sysinit")) != -1) {
2834 2838                          if (strlen(hw_serial) > 0)
2835 2839                                  hostid = (int32_t)atoi(hw_serial);
2836 2840                          (void) modunload(i);
2837 2841                  }
2838 2842  
2839 2843                  /*
2840 2844                   * We try to use the SMBIOS UUID. But not if it is blacklisted
2841 2845                   * in /etc/system.
2842 2846                   */
2843 2847                  if ((hostid == HW_INVALID_HOSTID) &&
2844 2848                      (smbios_broken_uuid == 0) &&
2845 2849                      (ksmbios != NULL) &&
2846 2850                      (smbios_info_system(ksmbios, &smsys) != SMB_ERR) &&
2847 2851                      (smsys.smbs_uuidlen >= 16)) {
2848 2852                          hostid = uuid_to_hostid(smsys.smbs_uuid);
2849 2853                  }
2850 2854  
2851 2855                  /*
2852 2856                   * Generate a "random" hostid using the clock.  These
2853 2857                   * hostids will change on each boot if the value is not
2854 2858                   * saved to a persistent /etc/hostid file.
2855 2859                   */
2856 2860                  if (hostid == HW_INVALID_HOSTID) {
2857 2861                          tsc = tsc_read();
2858 2862                          if (tsc == 0)   /* tsc_read can return zero sometimes */
2859 2863                                  hostid = (int32_t)tenmicrodata & 0x0CFFFFF;
2860 2864                          else
2861 2865                                  hostid = (int32_t)tsc & 0x0CFFFFF;
2862 2866                  }
2863 2867          } else {
2864 2868                  /* hostid file found */
2865 2869                  while (!done) {
2866 2870                          token = kobj_lex(file, tokbuf, sizeof (tokbuf));
2867 2871  
2868 2872                          switch (token) {
2869 2873                          case POUND:
2870 2874                                  /*
2871 2875                                   * skip comments
2872 2876                                   */
2873 2877                                  kobj_find_eol(file);
2874 2878                                  break;
2875 2879                          case STRING:
2876 2880                                  /*
2877 2881                                   * un-rot47 - obviously this
2878 2882                                   * nonsense is ascii-specific
2879 2883                                   */
2880 2884                                  for (c = (unsigned char *)tokbuf;
2881 2885                                      *c != '\0'; c++) {
2882 2886                                          *c += 47;
2883 2887                                          if (*c > '~')
2884 2888                                                  *c -= 94;
2885 2889                                          else if (*c < '!')
2886 2890                                                  *c += 94;
2887 2891                                  }
2888 2892                                  /*
2889 2893                                   * now we should have a real number
2890 2894                                   */
2891 2895  
2892 2896                                  if (kobj_getvalue(tokbuf, &tmp) != 0)
2893 2897                                          kobj_file_err(CE_WARN, file,
2894 2898                                              "Bad value %s for hostid",
2895 2899                                              tokbuf);
2896 2900                                  else
2897 2901                                          hostid = (int32_t)tmp;
2898 2902  
2899 2903                                  break;
2900 2904                          case EOF:
2901 2905                                  done = 1;
2902 2906                                  /* FALLTHROUGH */
2903 2907                          case NEWLINE:
2904 2908                                  kobj_newline(file);
2905 2909                                  break;
2906 2910                          default:
2907 2911                                  break;
2908 2912  
2909 2913                          }
2910 2914                  }
2911 2915                  if (hostid == HW_INVALID_HOSTID) /* didn't find a hostid */
2912 2916                          kobj_file_err(CE_WARN, file,
2913 2917                              "hostid missing or corrupt");
2914 2918  
2915 2919                  kobj_close_file(file);
2916 2920          }
2917 2921          /*
2918 2922           * hostid is now the value read from /etc/hostid, or the
2919 2923           * new hostid we generated in this routine or HW_INVALID_HOSTID if not
2920 2924           * set.
2921 2925           */
2922 2926          return (hostid);
2923 2927  }
2924 2928  
/*
 * Fold a NUL-terminated string into an int, treating every character as a
 * decimal digit.  There is no handling of sign, whitespace or non-digit
 * characters: each byte contributes (byte - '0') to the running value.
 */
static int
atoi(char *p)
{
	int value;

	for (value = 0; *p != '\0'; p++)
		value = (value * 10) + (*p - '0');

	return (value);
}
2935 2939  
2936 2940  #endif /* _SOFT_HOSTID */
2937 2941  
2938 2942  void
2939 2943  get_system_configuration(void)
2940 2944  {
2941 2945          char    prop[32];
2942 2946          u_longlong_t nodes_ll, cpus_pernode_ll, lvalue;
2943 2947  
2944 2948          if (BOP_GETPROPLEN(bootops, "nodes") > sizeof (prop) ||
2945 2949              BOP_GETPROP(bootops, "nodes", prop) < 0 ||
2946 2950              kobj_getvalue(prop, &nodes_ll) == -1 ||
2947 2951              nodes_ll > MAXNODES ||
2948 2952              BOP_GETPROPLEN(bootops, "cpus_pernode") > sizeof (prop) ||
2949 2953              BOP_GETPROP(bootops, "cpus_pernode", prop) < 0 ||
2950 2954              kobj_getvalue(prop, &cpus_pernode_ll) == -1) {
2951 2955                  system_hardware.hd_nodes = 1;
2952 2956                  system_hardware.hd_cpus_per_node = 0;
2953 2957          } else {
2954 2958                  system_hardware.hd_nodes = (int)nodes_ll;
2955 2959                  system_hardware.hd_cpus_per_node = (int)cpus_pernode_ll;
2956 2960          }
2957 2961  
2958 2962          if (BOP_GETPROPLEN(bootops, "kernelbase") > sizeof (prop) ||
2959 2963              BOP_GETPROP(bootops, "kernelbase", prop) < 0 ||
2960 2964              kobj_getvalue(prop, &lvalue) == -1)
2961 2965                  eprom_kernelbase = NULL;
2962 2966          else
2963 2967                  eprom_kernelbase = (uintptr_t)lvalue;
2964 2968  
2965 2969          if (BOP_GETPROPLEN(bootops, "segmapsize") > sizeof (prop) ||
2966 2970              BOP_GETPROP(bootops, "segmapsize", prop) < 0 ||
2967 2971              kobj_getvalue(prop, &lvalue) == -1)
2968 2972                  segmapsize = SEGMAPDEFAULT;
2969 2973          else
2970 2974                  segmapsize = (uintptr_t)lvalue;
2971 2975  
2972 2976          if (BOP_GETPROPLEN(bootops, "segmapfreelists") > sizeof (prop) ||
2973 2977              BOP_GETPROP(bootops, "segmapfreelists", prop) < 0 ||
2974 2978              kobj_getvalue(prop, &lvalue) == -1)
2975 2979                  segmapfreelists = 0;    /* use segmap driver default */
2976 2980          else
2977 2981                  segmapfreelists = (int)lvalue;
2978 2982  
2979 2983          /* physmem used to be here, but moved much earlier to fakebop.c */
2980 2984  }
2981 2985  
2982 2986  /*
2983 2987   * Add to a memory list.
2984 2988   * start = start of new memory segment
2985 2989   * len = length of new memory segment in bytes
2986 2990   * new = pointer to a new struct memlist
2987 2991   * memlistp = memory list to which to add segment.
2988 2992   */
2989 2993  void
2990 2994  memlist_add(
2991 2995          uint64_t start,
2992 2996          uint64_t len,
2993 2997          struct memlist *new,
2994 2998          struct memlist **memlistp)
2995 2999  {
2996 3000          struct memlist *cur;
2997 3001          uint64_t end = start + len;
2998 3002  
2999 3003          new->ml_address = start;
3000 3004          new->ml_size = len;
3001 3005  
3002 3006          cur = *memlistp;
3003 3007  
3004 3008          while (cur) {
3005 3009                  if (cur->ml_address >= end) {
3006 3010                          new->ml_next = cur;
3007 3011                          *memlistp = new;
3008 3012                          new->ml_prev = cur->ml_prev;
3009 3013                          cur->ml_prev = new;
3010 3014                          return;
3011 3015                  }
3012 3016                  ASSERT(cur->ml_address + cur->ml_size <= start);
3013 3017                  if (cur->ml_next == NULL) {
3014 3018                          cur->ml_next = new;
3015 3019                          new->ml_prev = cur;
3016 3020                          new->ml_next = NULL;
3017 3021                          return;
3018 3022                  }
3019 3023                  memlistp = &cur->ml_next;
3020 3024                  cur = cur->ml_next;
3021 3025          }
3022 3026  }
3023 3027  
3024 3028  void
3025 3029  kobj_vmem_init(vmem_t **text_arena, vmem_t **data_arena)
3026 3030  {
3027 3031          size_t tsize = e_modtext - modtext;
3028 3032          size_t dsize = e_moddata - moddata;
3029 3033  
3030 3034          *text_arena = vmem_create("module_text", tsize ? modtext : NULL, tsize,
3031 3035              1, segkmem_alloc, segkmem_free, heaptext_arena, 0, VM_SLEEP);
3032 3036          *data_arena = vmem_create("module_data", dsize ? moddata : NULL, dsize,
3033 3037              1, segkmem_alloc, segkmem_free, heap32_arena, 0, VM_SLEEP);
3034 3038  }
3035 3039  
3036 3040  caddr_t
3037 3041  kobj_text_alloc(vmem_t *arena, size_t size)
3038 3042  {
3039 3043          return (vmem_alloc(arena, size, VM_SLEEP | VM_BESTFIT));
3040 3044  }
3041 3045  
3042 3046  /*ARGSUSED*/
3043 3047  caddr_t
3044 3048  kobj_texthole_alloc(caddr_t addr, size_t size)
3045 3049  {
3046 3050          panic("unexpected call to kobj_texthole_alloc()");
3047 3051          /*NOTREACHED*/
3048 3052          return (0);
3049 3053  }
3050 3054  
3051 3055  /*ARGSUSED*/
3052 3056  void
3053 3057  kobj_texthole_free(caddr_t addr, size_t size)
3054 3058  {
3055 3059          panic("unexpected call to kobj_texthole_free()");
3056 3060  }
3057 3061  
3058 3062  /*
3059 3063   * This is called just after configure() in startup().
3060 3064   *
3061 3065   * The ISALIST concept is a bit hopeless on Intel, because
3062 3066   * there's no guarantee of an ever-more-capable processor
3063 3067   * given that various parts of the instruction set may appear
3064 3068   * and disappear between different implementations.
3065 3069   *
3066 3070   * While it would be possible to correct it and even enhance
3067 3071   * it somewhat, the explicit hardware capability bitmask allows
3068 3072   * more flexibility.
3069 3073   *
3070 3074   * So, we just leave this alone.
3071 3075   */
3072 3076  void
3073 3077  setx86isalist(void)
3074 3078  {
3075 3079          char *tp;
3076 3080          size_t len;
3077 3081          extern char *isa_list;
3078 3082  
3079 3083  #define TBUFSIZE        1024
3080 3084  
3081 3085          tp = kmem_alloc(TBUFSIZE, KM_SLEEP);
3082 3086          *tp = '\0';
3083 3087  
3084 3088  #if defined(__amd64)
3085 3089          (void) strcpy(tp, "amd64 ");
3086 3090  #endif
3087 3091  
3088 3092          switch (x86_vendor) {
3089 3093          case X86_VENDOR_Intel:
3090 3094          case X86_VENDOR_AMD:
3091 3095          case X86_VENDOR_TM:
3092 3096                  if (is_x86_feature(x86_featureset, X86FSET_CMOV)) {
3093 3097                          /*
3094 3098                           * Pentium Pro or later
3095 3099                           */
3096 3100                          (void) strcat(tp, "pentium_pro");
3097 3101                          (void) strcat(tp,
3098 3102                              is_x86_feature(x86_featureset, X86FSET_MMX) ?
3099 3103                              "+mmx pentium_pro " : " ");
3100 3104                  }
3101 3105                  /*FALLTHROUGH*/
3102 3106          case X86_VENDOR_Cyrix:
3103 3107                  /*
3104 3108                   * The Cyrix 6x86 does not have any Pentium features
3105 3109                   * accessible while not at privilege level 0.
3106 3110                   */
3107 3111                  if (is_x86_feature(x86_featureset, X86FSET_CPUID)) {
3108 3112                          (void) strcat(tp, "pentium");
3109 3113                          (void) strcat(tp,
3110 3114                              is_x86_feature(x86_featureset, X86FSET_MMX) ?
3111 3115                              "+mmx pentium " : " ");
3112 3116                  }
3113 3117                  break;
3114 3118          default:
3115 3119                  break;
3116 3120          }
3117 3121          (void) strcat(tp, "i486 i386 i86");
3118 3122          len = strlen(tp) + 1;   /* account for NULL at end of string */
3119 3123          isa_list = strcpy(kmem_alloc(len, KM_SLEEP), tp);
3120 3124          kmem_free(tp, TBUFSIZE);
3121 3125  
3122 3126  #undef TBUFSIZE
3123 3127  }
3124 3128  
3125 3129  
3126 3130  #ifdef __amd64
3127 3131  
3128 3132  void *
3129 3133  device_arena_alloc(size_t size, int vm_flag)
3130 3134  {
3131 3135          return (vmem_alloc(device_arena, size, vm_flag));
3132 3136  }
3133 3137  
3134 3138  void
3135 3139  device_arena_free(void *vaddr, size_t size)
3136 3140  {
3137 3141          vmem_free(device_arena, vaddr, size);
3138 3142  }
3139 3143  
3140 3144  #else /* __i386 */
3141 3145  
3142 3146  void *
3143 3147  device_arena_alloc(size_t size, int vm_flag)
3144 3148  {
3145 3149          caddr_t vaddr;
3146 3150          uintptr_t v;
3147 3151          size_t  start;
3148 3152          size_t  end;
3149 3153  
3150 3154          vaddr = vmem_alloc(heap_arena, size, vm_flag);
3151 3155          if (vaddr == NULL)
3152 3156                  return (NULL);
3153 3157  
3154 3158          v = (uintptr_t)vaddr;
3155 3159          ASSERT(v >= kernelbase);
3156 3160          ASSERT(v + size <= valloc_base);
3157 3161  
3158 3162          start = btop(v - kernelbase);
3159 3163          end = btop(v + size - 1 - kernelbase);
3160 3164          ASSERT(start < toxic_bit_map_len);
3161 3165          ASSERT(end < toxic_bit_map_len);
3162 3166  
3163 3167          while (start <= end) {
3164 3168                  BT_ATOMIC_SET(toxic_bit_map, start);
3165 3169                  ++start;
3166 3170          }
3167 3171          return (vaddr);
3168 3172  }
3169 3173  
3170 3174  void
3171 3175  device_arena_free(void *vaddr, size_t size)
3172 3176  {
3173 3177          uintptr_t v = (uintptr_t)vaddr;
3174 3178          size_t  start;
3175 3179          size_t  end;
3176 3180  
3177 3181          ASSERT(v >= kernelbase);
3178 3182          ASSERT(v + size <= valloc_base);
3179 3183  
3180 3184          start = btop(v - kernelbase);
3181 3185          end = btop(v + size - 1 - kernelbase);
3182 3186          ASSERT(start < toxic_bit_map_len);
3183 3187          ASSERT(end < toxic_bit_map_len);
3184 3188  
3185 3189          while (start <= end) {
3186 3190                  ASSERT(BT_TEST(toxic_bit_map, start) != 0);
3187 3191                  BT_ATOMIC_CLEAR(toxic_bit_map, start);
3188 3192                  ++start;
3189 3193          }
3190 3194          vmem_free(heap_arena, vaddr, size);
3191 3195  }
3192 3196  
3193 3197  /*
3194 3198   * returns 1st address in range that is in device arena, or NULL
3195 3199   * if len is not NULL it returns the length of the toxic range
3196 3200   */
3197 3201  void *
3198 3202  device_arena_contains(void *vaddr, size_t size, size_t *len)
3199 3203  {
3200 3204          uintptr_t v = (uintptr_t)vaddr;
3201 3205          uintptr_t eaddr = v + size;
3202 3206          size_t start;
3203 3207          size_t end;
3204 3208  
3205 3209          /*
3206 3210           * if called very early by kmdb, just return NULL
3207 3211           */
3208 3212          if (toxic_bit_map == NULL)
3209 3213                  return (NULL);
3210 3214  
3211 3215          /*
3212 3216           * First check if we're completely outside the bitmap range.
3213 3217           */
3214 3218          if (v >= valloc_base || eaddr < kernelbase)
3215 3219                  return (NULL);
3216 3220  
3217 3221          /*
3218 3222           * Trim ends of search to look at only what the bitmap covers.
3219 3223           */
3220 3224          if (v < kernelbase)
3221 3225                  v = kernelbase;
3222 3226          start = btop(v - kernelbase);
3223 3227          end = btop(eaddr - kernelbase);
3224 3228          if (end >= toxic_bit_map_len)
3225 3229                  end = toxic_bit_map_len;
3226 3230  
3227 3231          if (bt_range(toxic_bit_map, &start, &end, end) == 0)
3228 3232                  return (NULL);
3229 3233  
3230 3234          v = kernelbase + ptob(start);
3231 3235          if (len != NULL)
3232 3236                  *len = ptob(end - start);
3233 3237          return ((void *)v);
3234 3238  }
3235 3239  
3236 3240  #endif  /* __i386 */

↓ open down ↓

971 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX