/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright 2018 Nexenta Systems, Inc.  All rights reserved.
 */

#include <sys/machsystm.h>
#include <sys/archsystm.h>
#include <sys/vm.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/memlist_plat.h>
#include <sys/memlist_impl.h>
#include <sys/prom_plat.h>
#include <sys/prom_isa.h>
#include <sys/autoconf.h>
#include <sys/ivintr.h>
#include <sys/fpu/fpusystm.h>
#include <sys/iommutsb.h>
#include <vm/vm_dep.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/seg_kp.h>
#include <sys/sysconf.h>
#include <vm/hat_sfmmu.h>
#include <sys/kobj.h>
#include <sys/sun4asi.h>
#include <sys/clconf.h>
#include <sys/platform_module.h>
#include <sys/panic.h>
#include <sys/cpu_sgnblk_defs.h>
#include <sys/clock.h>
#include <sys/cmn_err.h>
#include <sys/dumphdr.h>
#include <sys/promif.h>
#include <sys/prom_debug.h>
#include <sys/traptrace.h>
#include <sys/memnode.h>
#include <sys/mem_cage.h>
#include <sys/mmu.h>
#include <sys/swap.h>

extern void setup_trap_table(void);
extern int cpu_intrq_setup(struct cpu *);
extern void cpu_intrq_register(struct cpu *);
extern void contig_mem_init(void);
extern caddr_t contig_mem_prealloc(caddr_t, pgcnt_t);
extern void mach_dump_buffer_init(void);
extern void mach_descrip_init(void);
extern void mach_descrip_startup_fini(void);
extern void mach_memscrub(void);
extern void mach_fpras(void);
extern void mach_cpu_halt_idle(void);
extern void mach_hw_copy_limit(void);
extern void load_mach_drivers(void);
extern void load_tod_module(void);
#pragma weak load_tod_module

extern int ndata_alloc_mmfsa(struct memlist *ndata);
#pragma weak ndata_alloc_mmfsa

extern void cif_init(void);
#pragma weak cif_init

extern void parse_idprom(void);
extern void add_vx_handler(char *, int, void (*)(cell_t *));
extern void mem_config_init(void);
extern void memseg_remap_init(void);

extern void mach_kpm_init(void);
extern void pcf_init();
extern int size_pse_array(pgcnt_t, int);
extern void pg_init();

/*
 * External Data:
 */
extern int vac_size;    /* cache size in bytes */
extern uint_t vac_mask; /* VAC alignment consistency mask */
extern uint_t vac_colors;

/*
 * Global Data Definitions:
 */

/*
 * XXX - Don't port this to new architectures
 * A 3rd party volume manager driver (vxdm) depends on the symbol romp.
 * 'romp' has no use with a prom with an IEEE 1275 client interface.
 * The driver doesn't use the value, but it depends on the symbol.
 */
void *romp;             /* veritas driver won't load without romp 4154976 */
/*
 * Declare these as initialized data so we can patch them.
 */
pgcnt_t physmem = 0;    /* memory size in pages, patch if you want less */
pgcnt_t segkpsize =
    btop(SEGKPDEFSIZE); /* size of segkp segment in pages */
uint_t segmap_percent = 6; /* Size of segmap segment */

int use_cache = 1;              /* cache not reliable (605 bugs) with MP */
int vac_copyback = 1;
char *cache_mode = NULL;
int use_mix = 1;
int prom_debug = 0;

caddr_t boot_tba;               /* %tba at boot - used by kmdb */
uint_t  tba_taken_over = 0;

caddr_t s_text;                 /* start of kernel text segment */
caddr_t e_text;                 /* end of kernel text segment */
caddr_t s_data;                 /* start of kernel data segment */
caddr_t e_data;                 /* end of kernel data segment */

caddr_t modtext;                /* beginning of module text */
size_t  modtext_sz;             /* size of module text */
caddr_t moddata;                /* beginning of module data reserve */
caddr_t e_moddata;              /* end of module data reserve */

/*
 * End of first block of contiguous kernel in 32-bit virtual address space
 */
caddr_t         econtig32;      /* end of first blk of contiguous kernel */

caddr_t         ncbase;         /* beginning of non-cached segment */
caddr_t         ncend;          /* end of non-cached segment */

size_t  ndata_remain_sz;        /* bytes from end of data to 4MB boundary */
caddr_t nalloc_base;            /* beginning of nucleus allocation */
caddr_t nalloc_end;             /* end of nucleus allocatable memory */
caddr_t valloc_base;            /* beginning of kvalloc segment */

caddr_t kmem64_base;            /* base of kernel mem segment in 64-bit space */
caddr_t kmem64_end;             /* end of kernel mem segment in 64-bit space */
size_t  kmem64_sz;              /* bytes in kernel mem segment, 64-bit space */
caddr_t kmem64_aligned_end;     /* end of large page, overmaps 64-bit space */
int     kmem64_szc;             /* page size code */
uint64_t kmem64_pabase = (uint64_t)-1;  /* physical address of kmem64_base */

uintptr_t shm_alignment;        /* VAC address consistency modulus */
struct memlist *phys_install;   /* Total installed physical memory */
struct memlist *phys_avail;     /* Available (unreserved) physical memory */
struct memlist *virt_avail;     /* Available (unmapped?) virtual memory */
struct memlist *nopp_list;      /* pages with no backing page structs */
struct memlist ndata;           /* memlist of nucleus allocatable memory */
int memexp_flag;                /* memory expansion card flag */
uint64_t ecache_flushaddr;      /* physical address used for flushing E$ */
pgcnt_t obp_pages;              /* Physical pages used by OBP */

/*
 * VM data structures
 */
long page_hashsz;               /* Size of page hash table (power of two) */
unsigned int page_hashsz_shift; /* log2(page_hashsz) */
struct page *pp_base;           /* Base of system page struct array */
size_t pp_sz;                   /* Size in bytes of page struct array */
struct page **page_hash;        /* Page hash table */
pad_mutex_t *pse_mutex;         /* Locks protecting pp->p_selock */
size_t pse_table_size;          /* Number of mutexes in pse_mutex[] */
int pse_shift;                  /* log2(pse_table_size) */
struct seg ktextseg;            /* Segment used for kernel executable image */
struct seg kvalloc;             /* Segment used for "valloc" mapping */
struct seg kpseg;               /* Segment used for pageable kernel virt mem */
struct seg ktexthole;           /* Segment used for nucleus text hole */
struct seg kmapseg;             /* Segment used for generic kernel mappings */
struct seg kpmseg;              /* Segment used for physical mapping */
struct seg kdebugseg;           /* Segment used for the kernel debugger */

void *kpm_pp_base;              /* Base of system kpm_page array */
size_t  kpm_pp_sz;              /* Size of system kpm_page array */
pgcnt_t kpm_npages;             /* How many kpm pages are managed */

struct seg *segkp = &kpseg; /* Pageable kernel virtual memory segment */
struct seg *segkmap = &kmapseg;     /* Kernel generic mapping segment */
struct seg *segkpm = &kpmseg;       /* 64bit kernel physical mapping segment */

int segzio_fromheap = 0;        /* zio allocations occur from heap */
caddr_t segzio_base;            /* Base address of segzio */
pgcnt_t segziosize = 0;         /* size of zio segment in pages */

/*
 * A static DR page_t VA map is reserved that can map the page structures
 * for a domain's entire RA space. The pages that back this space are
 * dynamically allocated and need not be physically contiguous.  The DR
 * map size is derived from KPM size.
 */
int ppvm_enable = 0;            /* Static virtual map for page structs */
page_t *ppvm_base;              /* Base of page struct map */
pgcnt_t ppvm_size = 0;          /* Size of page struct map */

/*
 * debugger pages (if allocated)
 */
struct vnode kdebugvp;

/*
 * VA range available to the debugger
 */
const caddr_t kdi_segdebugbase = (const caddr_t)SEGDEBUGBASE;
const size_t kdi_segdebugsize = SEGDEBUGSIZE;

/*
 * Segment for relocated kernel structures in 64-bit large RAM kernels
 */
struct seg kmem64;

struct memseg *memseg_free;

struct vnode unused_pages_vp;

/*
 * VM data structures allocated early during boot.
 */
size_t pagehash_sz;
uint64_t memlist_sz;

char tbr_wr_addr_inited = 0;

caddr_t mpo_heap32_buf = NULL;
size_t  mpo_heap32_bufsz = 0;

/*
 * Static Routines:
 */
static int ndata_alloc_memseg(struct memlist *, size_t);
static void memlist_new(uint64_t, uint64_t, struct memlist **);
static void memlist_add(uint64_t, uint64_t,
        struct memlist **, struct memlist **);
static void kphysm_init(void);
static void kvm_init(void);
static void install_kmem64_tte(void);

static void startup_init(void);
static void startup_memlist(void);
static void startup_modules(void);
static void startup_bop_gone(void);
static void startup_vm(void);
static void startup_end(void);
static void setup_cage_params(void);
static void startup_create_io_node(void);

static pgcnt_t npages;
static struct memlist *memlist;
void *memlist_end;

static pgcnt_t bop_alloc_pages;
static caddr_t hblk_base;
uint_t hblk_alloc_dynamic = 0;
uint_t hblk1_min = H1MIN;


/*
 * Hooks for unsupported platforms and down-rev firmware
 */
int iam_positron(void);
#pragma weak iam_positron
static void do_prom_version_check(void);

/*
 * After receiving a thermal interrupt, this is the number of seconds
 * to delay before shutting off the system, in case the graceful
 * shutdown attempt fails.  Use /etc/system to change the delay if this
 * isn't large enough.
 */
int thermal_powerdown_delay = 1200;
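
/*
 * For example, to double the delay one could add the following line to
 * /etc/system:
 *
 *      set thermal_powerdown_delay = 2400
 */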

/*
 * Used to hold off page relocations into the cage until OBP has completed
 * its boot-time handoff of its resources to the kernel.
 */
int page_relocate_ready = 0;

/*
 * Indicate if kmem64 allocation was done in small chunks
 */
int kmem64_smchunks = 0;

/*
 * Enable some debugging messages concerning memory usage...
 */
#ifdef  DEBUGGING_MEM
static int debugging_mem;
static void
printmemlist(char *title, struct memlist *list)
{
        if (!debugging_mem)
                return;

        printf("%s\n", title);

        while (list) {
                prom_printf("\taddr = 0x%x %8x, size = 0x%x %8x\n",
                    (uint32_t)(list->ml_address >> 32),
                    (uint32_t)list->ml_address,
                    (uint32_t)(list->ml_size >> 32),
                    (uint32_t)(list->ml_size));
                list = list->ml_next;
        }
}

void
printmemseg(struct memseg *memseg)
{
        if (!debugging_mem)
                return;

        printf("memseg\n");

        while (memseg) {
                prom_printf("\tpage = 0x%p, epage = 0x%p, "
                    "pfn = 0x%x, epfn = 0x%x\n",
                    memseg->pages, memseg->epages,
                    memseg->pages_base, memseg->pages_end);
                memseg = memseg->next;
        }
}

#define debug_pause(str)        halt((str))
#define MPRINTF(str)            if (debugging_mem) prom_printf((str))
#define MPRINTF1(str, a)        if (debugging_mem) prom_printf((str), (a))
#define MPRINTF2(str, a, b)     if (debugging_mem) prom_printf((str), (a), (b))
#define MPRINTF3(str, a, b, c) \
        if (debugging_mem) prom_printf((str), (a), (b), (c))
#else   /* DEBUGGING_MEM */
#define MPRINTF(str)
#define MPRINTF1(str, a)
#define MPRINTF2(str, a, b)
#define MPRINTF3(str, a, b, c)
#endif  /* DEBUGGING_MEM */
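
/*
 * Example usage, assuming a DEBUGGING_MEM build with debugging_mem set:
 *
 *      MPRINTF1("npages = %lu\n", npages);
 *
 * On non-DEBUGGING_MEM builds the MPRINTF macros expand to nothing.
 */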


/*
 *
 *                    Kernel's Virtual Memory Layout.
 *                       /-----------------------\
 * 0xFFFFFFFF.FFFFFFFF  -|                       |-
 *                       |   OBP's virtual page  |
 *                       |        tables         |
 * 0xFFFFFFFC.00000000  -|-----------------------|-
 *                       :                       :
 *                       :                       :
 *                      -|-----------------------|-
 *                       |       segzio          | (base and size vary)
 * 0xFFFFFE00.00000000  -|-----------------------|-
 *                       |                       |  Ultrasparc I/II support
 *                       |    segkpm segment     |  up to 2TB of physical
 *                       | (64-bit kernel ONLY)  |  memory, VAC has 2 colors
 *                       |                       |
 * 0xFFFFFA00.00000000  -|-----------------------|- 2TB segkpm alignment
 *                       :                       :
 *                       :                       :
 * 0xFFFFF810.00000000  -|-----------------------|- hole_end
 *                       |                       |      ^
 *                       |  UltraSPARC I/II call |      |
 *                       | bug requires an extra |      |
 *                       | 4 GB of space between |      |
 *                       |   hole and used RAM   |      |
 *                       |                       |      |
 * 0xFFFFF800.00000000  -|-----------------------|-     |
 *                       |                       |      |
 *                       | Virtual Address Hole  |   UltraSPARC
 *                       |  on UltraSPARC I/II   |  I/II * ONLY *
 *                       |                       |      |
 * 0x00000800.00000000  -|-----------------------|-     |
 *                       |                       |      |
 *                       |  UltraSPARC I/II call |      |
 *                       | bug requires an extra |      |
 *                       | 4 GB of space between |      |
 *                       |   hole and used RAM   |      |
 *                       |                       |      v
 * 0x000007FF.00000000  -|-----------------------|- hole_start -----
 *                       :                       :                 ^
 *                       :                       :                 |
 *                       |-----------------------|                 |
 *                       |                       |                 |
 *                       |  ecache flush area    |                 |
 *                       |  (twice largest e$)   |                 |
 *                       |                       |                 |
 * 0x00000XXX.XXX00000  -|-----------------------|- kmem64_        |
 *                       | overmapped area       |   aligned_end   |
 *                       | (kmem64_alignsize     |                 |
 *                       |  boundary)            |                 |
 * 0x00000XXX.XXXXXXXX  -|-----------------------|- kmem64_end     |
 *                       |                       |                 |
 *                       |   64-bit kernel ONLY  |                 |
 *                       |                       |                 |
 *                       |    kmem64 segment     |                 |
 *                       |                       |                 |
 *                       | (Relocated extra HME  |           Approximately
 *                       |   block allocations,  |          1 TB of virtual
 *                       |   memnode freelists,  |           address space
 *                       |    HME hash buckets,  |                 |
 *                       | mml_table, kpmp_table,|                 |
 *                       |  page_t array and     |                 |
 *                       |  hashblock pool to    |                 |
 *                       |   avoid hard-coded    |                 |
 *                       |     32-bit vaddr      |                 |
 *                       |     limitations)      |                 |
 *                       |                       |                 v
 * 0x00000700.00000000  -|-----------------------|- SYSLIMIT (kmem64_base)
 *                       |                       |
 *                       |  segkmem segment      | (SYSLIMIT - SYSBASE = 4TB)
 *                       |                       |
 * 0x00000300.00000000  -|-----------------------|- SYSBASE
 *                       :                       :
 *                       :                       :
 *                      -|-----------------------|-
 *                       |                       |
 *                       |  segmap segment       |   SEGMAPSIZE (1/8th physmem,
 *                       |                       |               256G MAX)
 * 0x000002a7.50000000  -|-----------------------|- SEGMAPBASE
 *                       :                       :
 *                       :                       :
 *                      -|-----------------------|-
 *                       |                       |
 *                       |       segkp           |    SEGKPSIZE (2GB)
 *                       |                       |
 *                       |                       |
 * 0x000002a1.00000000  -|-----------------------|- SEGKPBASE
 *                       |                       |
 * 0x000002a0.00000000  -|-----------------------|- MEMSCRUBBASE
 *                       |                       |       (SEGKPBASE - 0x400000)
 * 0x0000029F.FFE00000  -|-----------------------|- ARGSBASE
 *                       |                       |       (MEMSCRUBBASE - NCARGS)
 * 0x0000029F.FFD80000  -|-----------------------|- PPMAPBASE
 *                       |                       |       (ARGSBASE - PPMAPSIZE)
 * 0x0000029F.FFD00000  -|-----------------------|- PPMAP_FAST_BASE
 *                       |                       |
 * 0x0000029F.FF980000  -|-----------------------|- PIOMAPBASE
 *                       |                       |
 * 0x0000029F.FF580000  -|-----------------------|- NARG_BASE
 *                       :                       :
 *                       :                       :
 * 0x00000000.FFFFFFFF  -|-----------------------|- OFW_END_ADDR
 *                       |                       |
 *                       |         OBP           |
 *                       |                       |
 * 0x00000000.F0000000  -|-----------------------|- OFW_START_ADDR
 *                       |         kmdb          |
 * 0x00000000.EDD00000  -|-----------------------|- SEGDEBUGBASE
 *                       :                       :
 *                       :                       :
 * 0x00000000.7c000000  -|-----------------------|- SYSLIMIT32
 *                       |                       |
 *                       |  segkmem32 segment    | (SYSLIMIT32 - SYSBASE32 =
 *                       |                       |    ~64MB)
 *                      -|-----------------------|
 *                       |      IVSIZE           |
 * 0x00000000.70004000  -|-----------------------|
 *                       |     panicbuf          |
 * 0x00000000.70002000  -|-----------------------|
 *                       |      PAGESIZE         |
 * 0x00000000.70000000  -|-----------------------|- SYSBASE32
 *                       |       boot-time       |
 *                       |    temporary space    |
 * 0x00000000.4C000000  -|-----------------------|- BOOTTMPBASE
 *                       :                       :
 *                       :                       :
 *                       |                       |
 *                       |-----------------------|- econtig32
 *                       |    vm structures      |
 * 0x00000000.01C00000   |-----------------------|- nalloc_end
 *                       |         TSBs          |
 *                       |-----------------------|- end/nalloc_base
 *                       |   kernel data & bss   |
 * 0x00000000.01800000  -|-----------------------|
 *                       :   nucleus text hole   :
 * 0x00000000.01400000  -|-----------------------|
 *                       :                       :
 *                       |-----------------------|
 *                       |      module text      |
 *                       |-----------------------|- e_text/modtext
 *                       |      kernel text      |
 *                       |-----------------------|
 *                       |    trap table (48k)   |
 * 0x00000000.01000000  -|-----------------------|- KERNELBASE
 *                       | reserved for trapstat |} TSTAT_TOTAL_SIZE
 *                       |-----------------------|
 *                       |                       |
 *                       |        invalid        |
 *                       |                       |
 * 0x00000000.00000000  _|_______________________|
 *
 *
 *
 *                   32-bit User Virtual Memory Layout.
 *                       /-----------------------\
 *                       |                       |
 *                       |        invalid        |
 *                       |                       |
 *          0xFFC00000  -|-----------------------|- USERLIMIT
 *                       |       user stack      |
 *                       :                       :
 *                       :                       :
 *                       :                       :
 *                       |       user data       |
 *                      -|-----------------------|-
 *                       |       user text       |
 *          0x00002000  -|-----------------------|-
 *                       |       invalid         |
 *          0x00000000  _|_______________________|
 *
 *
 *
 *                   64-bit User Virtual Memory Layout.
 *                       /-----------------------\
 *                       |                       |
 *                       |        invalid        |
 *                       |                       |
 *  0xFFFFFFFF.80000000 -|-----------------------|- USERLIMIT
 *                       |       user stack      |
 *                       :                       :
 *                       :                       :
 *                       :                       :
 *                       |       user data       |
 *                      -|-----------------------|-
 *                       |       user text       |
 *  0x00000000.01000000 -|-----------------------|-
 *                       |       invalid         |
 *  0x00000000.00000000 _|_______________________|
 */

extern caddr_t ecache_init_scrub_flush_area(caddr_t alloc_base);
extern uint64_t ecache_flush_address(void);

#pragma weak load_platform_modules
#pragma weak plat_startup_memlist
#pragma weak ecache_init_scrub_flush_area
#pragma weak ecache_flush_address


/*
 * By default the DR Cage is enabled for maximum OS
 * MPSS performance.  Users needing to disable the cage mechanism
 * can set this variable to zero via /etc/system.
 * Disabling the cage on systems supporting Dynamic Reconfiguration (DR)
 * will result in loss of DR functionality.
 * Platforms wishing to disable kernel Cage by default
 * should do so in their set_platform_defaults() routine.
 */
int     kernel_cage_enable = 1;
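
/*
 * For example, the cage (and with it DR) can be disabled by adding the
 * following line to /etc/system:
 *
 *      set kernel_cage_enable = 0
 */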

static void
setup_cage_params(void)
{
        void (*func)(void);

        func = (void (*)(void))kobj_getsymvalue("set_platform_cage_params", 0);
        if (func != NULL) {
                (*func)();
                return;
        }

        if (kernel_cage_enable == 0) {
                return;
        }
        kcage_range_init(phys_avail, KCAGE_DOWN, total_pages / 256);

        if (kcage_on) {
                cmn_err(CE_NOTE, "!Kernel Cage is ENABLED");
        } else {
                cmn_err(CE_NOTE, "!Kernel Cage is DISABLED");
        }

}

/*
 * Machine-dependent startup code
 */
void
startup(void)
{
        startup_init();
        if (&startup_platform)
                startup_platform();
        startup_memlist();
        startup_modules();
        setup_cage_params();
        startup_bop_gone();
        startup_vm();
        startup_end();
}

struct regs sync_reg_buf;
uint64_t sync_tt;

void
sync_handler(void)
{
        struct  panic_trap_info         ti;
        int i;

        /*
         * Prevent trying to talk to the other CPUs since they are
         * sitting in the prom and won't reply.
         */
        for (i = 0; i < NCPU; i++) {
                if ((i != CPU->cpu_id) && CPU_XCALL_READY(i)) {
                        cpu[i]->cpu_flags &= ~CPU_READY;
                        cpu[i]->cpu_flags |= CPU_QUIESCED;
                        CPUSET_DEL(cpu_ready_set, cpu[i]->cpu_id);
                }
        }

        /*
         * Force a serial dump, since there are no CPUs to help.
         */
        dump_ncpu_low = 0;

        /*
         * We've managed to get here without going through the
         * normal panic code path. Try and save some useful
         * information.
         */
        if (!panicstr && (curthread->t_panic_trap == NULL)) {
                ti.trap_type = sync_tt;
                ti.trap_regs = &sync_reg_buf;
                ti.trap_addr = NULL;
                ti.trap_mmu_fsr = 0x0;

                curthread->t_panic_trap = &ti;
        }

        /*
         * If we're re-entering the panic path, update the signature
         * block so that the SC knows we're in the second part of panic.
         */
        if (panicstr)
                CPU_SIGNATURE(OS_SIG, SIGST_EXIT, SIGSUBST_DUMP, -1);

        nopanicdebug = 1; /* do not perform debug_enter() prior to dump */
        panic("sync initiated");
}


static void
startup_init(void)
{
        /*
         * We want to save the registers while we're still in OBP
         * so that we know they haven't been fiddled with since.
         * (In principle, OBP can't change them just because it
         * makes a callback, but we'd rather not depend on that
         * behavior.)
         */
        char            sync_str[] =
            "warning @ warning off : sync "
            "%%tl-c %%tstate h# %p x! "
            "%%g1 h# %p x! %%g2 h# %p x! %%g3 h# %p x! "
            "%%g4 h# %p x! %%g5 h# %p x! %%g6 h# %p x! "
            "%%g7 h# %p x! %%o0 h# %p x! %%o1 h# %p x! "
            "%%o2 h# %p x! %%o3 h# %p x! %%o4 h# %p x! "
            "%%o5 h# %p x! %%o6 h# %p x! %%o7 h# %p x! "
            "%%tl-c %%tpc h# %p x! %%tl-c %%tnpc h# %p x! "
            "%%y h# %p l! %%tl-c %%tt h# %p x! "
            "sync ; warning !";

        /*
         * 20 == num of %p substrings
         * 16 == max num of chars %p will expand to.
         */
        char            bp[sizeof (sync_str) + 16 * 20];

        /*
         * Initialize ptl1 stack for the 1st CPU.
         */
        ptl1_init_cpu(&cpu0);

        /*
         * Initialize the address map for cache consistent mappings
         * to random pages; must be done after vac_size is set.
         */
        ppmapinit();

        /*
         * Initialize the PROM callback handler.
         */
        init_vx_handler();

        /*
         * have prom call sync_callback() to handle the sync and
         * save some useful information which will be stored in the
         * core file later.
         */
        (void) sprintf((char *)bp, sync_str,
            (void *)&sync_reg_buf.r_tstate, (void *)&sync_reg_buf.r_g1,
            (void *)&sync_reg_buf.r_g2, (void *)&sync_reg_buf.r_g3,
            (void *)&sync_reg_buf.r_g4, (void *)&sync_reg_buf.r_g5,
            (void *)&sync_reg_buf.r_g6, (void *)&sync_reg_buf.r_g7,
            (void *)&sync_reg_buf.r_o0, (void *)&sync_reg_buf.r_o1,
            (void *)&sync_reg_buf.r_o2, (void *)&sync_reg_buf.r_o3,
            (void *)&sync_reg_buf.r_o4, (void *)&sync_reg_buf.r_o5,
            (void *)&sync_reg_buf.r_o6, (void *)&sync_reg_buf.r_o7,
            (void *)&sync_reg_buf.r_pc, (void *)&sync_reg_buf.r_npc,
            (void *)&sync_reg_buf.r_y, (void *)&sync_tt);
        prom_interpret(bp, 0, 0, 0, 0, 0);
        add_vx_handler("sync", 1, (void (*)(cell_t *))sync_handler);
}


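/*
 * Return the number of bytes needed for a page_t array that describes
 * npages pages.
 */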
size_t
calc_pp_sz(pgcnt_t npages)
{

        return (npages * sizeof (struct page));
}

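/*
 * Set up the kpm page size globals and return the number of bytes
 * needed for the kpm_page_t (large kpm pages) or kpm_spage_t (small
 * kpm pages) array, as selected by kpm_smallpages.
 */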
size_t
calc_kpmpp_sz(pgcnt_t npages)
{

        kpm_pgshft = (kpm_smallpages == 0) ? MMU_PAGESHIFT4M : MMU_PAGESHIFT;
        kpm_pgsz = 1ull << kpm_pgshft;
        kpm_pgoff = kpm_pgsz - 1;
        kpmp2pshft = kpm_pgshft - PAGESHIFT;
        kpmpnpgs = 1 << kpmp2pshft;

        if (kpm_smallpages == 0) {
                /*
                 * Avoid fragmentation problems in kphysm_init()
                 * by allocating for all of physical memory
                 */
                kpm_npages = ptokpmpr(physinstalled);
                return (kpm_npages * sizeof (kpm_page_t));
        } else {
                kpm_npages = npages;
                return (kpm_npages * sizeof (kpm_spage_t));
        }
}

size_t
calc_pagehash_sz(pgcnt_t npages)
{
        /* LINTED */
        ASSERT(P2SAMEHIGHBIT((1 << PP_SHIFT), (sizeof (struct page))));
        /*
         * The page structure hash table size is a power of 2
         * such that the average hash chain length is PAGE_HASHAVELEN.
         */
        page_hashsz = npages / PAGE_HASHAVELEN;
        page_hashsz_shift = MAX((AN_VPSHIFT + VNODE_ALIGN_LOG2 + 1),
            highbit(page_hashsz));
        page_hashsz = 1 << page_hashsz_shift;
        return (page_hashsz * sizeof (struct page *));
}
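
/*
 * A worked example with made-up numbers: given npages = 1,000,000 and
 * PAGE_HASHAVELEN = 4, the first estimate is 250,000 buckets;
 * highbit(250000) is 18, so (assuming the AN_VPSHIFT +
 * VNODE_ALIGN_LOG2 + 1 floor is below that) the table is rounded up to
 * 1 << 18 = 262,144 buckets, about 2 MB of page_t pointers on a 64-bit
 * kernel.
 */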

int testkmem64_smchunks = 0;

int
alloc_kmem64(caddr_t base, caddr_t end)
{
        int i;
        caddr_t aligned_end = NULL;

        if (testkmem64_smchunks)
                return (1);

        /*
         * Make one large memory alloc after figuring out the 64-bit size. This
         * will enable use of the largest page size appropriate for the system
         * architecture.
         */
        ASSERT(mmu_exported_pagesize_mask & (1 << TTE8K));
        ASSERT(IS_P2ALIGNED(base, TTEBYTES(max_bootlp_tteszc)));
        for (i = max_bootlp_tteszc; i >= TTE8K; i--) {
                size_t alloc_size, alignsize;
#if !defined(C_OBP)
                unsigned long long pa;
#endif  /* !C_OBP */

                if ((mmu_exported_pagesize_mask & (1 << i)) == 0)
                        continue;
                alignsize = TTEBYTES(i);
                kmem64_szc = i;

                /* limit page size for small memory */
                if (mmu_btop(alignsize) > (npages >> 2))
                        continue;

                aligned_end = (caddr_t)roundup((uintptr_t)end, alignsize);
                alloc_size = aligned_end - base;
#if !defined(C_OBP)
                if (prom_allocate_phys(alloc_size, alignsize, &pa) == 0) {
                        if (prom_claim_virt(alloc_size, base) != (caddr_t)-1) {
                                kmem64_pabase = pa;
                                kmem64_aligned_end = aligned_end;
                                install_kmem64_tte();
                                break;
                        } else {
                                prom_free_phys(alloc_size, pa);
                        }
                }
#else   /* !C_OBP */
                if (prom_alloc(base, alloc_size, alignsize) == base) {
                        kmem64_pabase = va_to_pa(kmem64_base);
                        kmem64_aligned_end = aligned_end;
                        break;
                }
#endif  /* !C_OBP */
                if (i == TTE8K) {
#ifdef sun4v
                        /* return failure to try small allocations */
                        return (1);
#else
                        prom_panic("kmem64 allocation failure");
#endif
                }
        }
        ASSERT(aligned_end != NULL);
        return (0);
}

static prom_memlist_t *boot_physinstalled, *boot_physavail, *boot_virtavail;
static size_t boot_physinstalled_len, boot_physavail_len, boot_virtavail_len;

#if !defined(C_OBP)
/*
 * Install a temporary tte handler in OBP for kmem64 area.
 *
 * We map kmem64 area with large pages before the trap table is taken
 * over. Since OBP makes 8K mappings, it can create 8K tlb entries in
 * the same area. Duplicate tlb entries with different page sizes
 * cause unpredictable behavior.  To avoid this, we don't create
 * kmem64 mappings via BOP_ALLOC (ends up as prom_alloc() call to
 * OBP).  Instead, we manage translations with a temporary va>tte-data
 * handler (kmem64-tte).  This handler is replaced by unix-tte when
 * the trap table is taken over.
 *
 * The temporary handler knows the physical address of the kmem64
 * area. It uses the prom's pgmap@ Forth word for other addresses.
 *
 * We have to use BOP_ALLOC() method for C-OBP platforms because
 * pgmap@ is not defined in C-OBP. C-OBP is only used on serengeti
 * sun4u platforms. On sun4u we flush tlb after trap table is taken
 * over if we use large pages for kernel heap and kmem64. Since sun4u
 * prom (unlike sun4v) calls va>tte-data first for client address
 * translation, prom's ttes for kmem64 can't get into TLB even if we
 * later switch to prom's trap table again. C-OBP uses 4M pages for
 * client mappings when possible so on all platforms we get the
 * benefit from large mappings for kmem64 area immediately during
 * boot.
 *
 * pseudo code:
 * if (context != 0) {
 *      return false
 * } else if (miss_va in range[kmem64_base, kmem64_end)) {
 *      tte = tte_template +
 *              (((miss_va & pagemask) - kmem64_base));
 *      return tte, true
 * } else {
 *      return pgmap@ result
 * }
 */
char kmem64_obp_str[] =
        "h# %lx constant kmem64-base "
        "h# %lx constant kmem64-end "
        "h# %lx constant kmem64-pagemask "
        "h# %lx constant kmem64-template "

        ": kmem64-tte ( addr cnum -- false | tte-data true ) "
        "    if                                       ( addr ) "
        "       drop false exit then                  ( false ) "
        "    dup  kmem64-base kmem64-end  within  if  ( addr ) "
        "       kmem64-pagemask and                   ( addr' ) "
        "       kmem64-base -                         ( addr' ) "
        "       kmem64-template +                     ( tte ) "
        "       true                                  ( tte true ) "
        "    else                                     ( addr ) "
        "       pgmap@                                ( tte ) "
        "       dup 0< if true else drop false then   ( tte true  |  false ) "
        "    then                                     ( tte true  |  false ) "
        "; "

        "' kmem64-tte is va>tte-data "
;

static void
install_kmem64_tte()
{
        char b[sizeof (kmem64_obp_str) + (4 * 16)];
        tte_t tte;

        PRM_DEBUG(kmem64_pabase);
        PRM_DEBUG(kmem64_szc);
        sfmmu_memtte(&tte, kmem64_pabase >> MMU_PAGESHIFT,
            PROC_DATA | HAT_NOSYNC, kmem64_szc);
        PRM_DEBUG(tte.ll);
        (void) sprintf(b, kmem64_obp_str,
            kmem64_base, kmem64_end, TTE_PAGEMASK(kmem64_szc), tte.ll);
        ASSERT(strlen(b) < sizeof (b));
        prom_interpret(b, 0, 0, 0, 0, 0);
}
#endif  /* !C_OBP */

/*
 * As OBP takes up some RAM when the system boots, pages will already be "lost"
 * to the system and reflected in npages by the time we see it.
 *
 * We only want to allocate kernel structures in the 64-bit virtual address
 * space on systems with enough RAM to make the overhead of keeping track of
 * an extra kernel memory segment worthwhile.
 *
 * Since OBP has already performed its memory allocations by this point, if we
 * have more than MINMOVE_RAM_MB MB of RAM left free, go ahead and map
 * memory in the 64-bit virtual address space; otherwise keep allocations
 * contiguous with what we've mapped so far in the 32-bit virtual address
 * space.
 */
#define MINMOVE_RAM_MB  ((size_t)1900)
#define MB_TO_BYTES(mb) ((mb) * 1048576ul)
#define BYTES_TO_MB(b) ((b) / 1048576ul)

pgcnt_t tune_npages = (pgcnt_t)
        (MB_TO_BYTES(MINMOVE_RAM_MB)/ (size_t)MMU_PAGESIZE);
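
/*
 * With the 8K base MMU_PAGESIZE of sun4 platforms this works out to
 * 1900 MB / 8 KB = 243,200 pages.
 */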

#pragma weak page_set_colorequiv_arr_cpu
extern void page_set_colorequiv_arr_cpu(void);
extern void page_set_colorequiv_arr(void);

static pgcnt_t ramdisk_npages;
static struct memlist *old_phys_avail;

kcage_dir_t kcage_startup_dir = KCAGE_DOWN;

static void
startup_memlist(void)
{
        size_t hmehash_sz, pagelist_sz, tt_sz;
        size_t psetable_sz;
        caddr_t alloc_base;
        caddr_t memspace;
        struct memlist *cur;
        size_t syslimit = (size_t)SYSLIMIT;
        size_t sysbase = (size_t)SYSBASE;

        /*
         * Initialize enough of the system to allow kmem_alloc to work by
         * calling boot to allocate its memory until the time that
         * kvm_init is completed.  The page structs are allocated after
         * rounding up end to the nearest page boundary; the memsegs are
         * initialized and the space they use comes from the kernel heap.
         * With appropriate initialization, they can be reallocated later
         * to a size appropriate for the machine's configuration.
         *
         * At this point, memory is allocated for things that will never
         * need to be freed; this used to be "valloced".  This allows a
         * savings as the pages don't need page structures to describe
         * them because they will not be managed by the vm system.
         */

        /*
         * We're loaded by boot with the following configuration (as
         * specified in the sun4u/conf/Mapfile):
         *
         *      text:           4 MB chunk aligned on a 4MB boundary
         *      data & bss: 4 MB chunk aligned on a 4MB boundary
         *
         * These two chunks will eventually be mapped by 2 locked 4MB
         * ttes and will represent the nucleus of the kernel.  This gives
         * us some free space that is already allocated, some or all of
         * which is made available to kernel module text.
         *
         * The free space in the data-bss chunk is used for nucleus
         * allocatable data structures and we reserve it using the
         * nalloc_base and nalloc_end variables.  This space is currently
         * being used for hat data structures required for tlb miss
         * handling operations.  We align nalloc_base to a l2 cache
         * linesize because this is the line size the hardware uses to
         * maintain cache coherency.
         * 512K is carved out for module data.
         */

        moddata = (caddr_t)roundup((uintptr_t)e_data, MMU_PAGESIZE);
        e_moddata = moddata + MODDATA;
        nalloc_base = e_moddata;

        nalloc_end = (caddr_t)roundup((uintptr_t)nalloc_base, MMU_PAGESIZE4M);
        valloc_base = nalloc_base;

        /*
         * Calculate the start of the data segment.
         */
        if (((uintptr_t)e_moddata & MMU_PAGEMASK4M) != (uintptr_t)s_data)
                prom_panic("nucleus data overflow");

        PRM_DEBUG(moddata);
        PRM_DEBUG(nalloc_base);
        PRM_DEBUG(nalloc_end);

        /*
         * Remember any slop after e_text so we can give it to the modules.
         */
        PRM_DEBUG(e_text);
        modtext = (caddr_t)roundup((uintptr_t)e_text, MMU_PAGESIZE);
        if (((uintptr_t)e_text & MMU_PAGEMASK4M) != (uintptr_t)s_text)
                prom_panic("nucleus text overflow");
        modtext_sz = (caddr_t)roundup((uintptr_t)modtext, MMU_PAGESIZE4M) -
            modtext;
        PRM_DEBUG(modtext);
        PRM_DEBUG(modtext_sz);

        init_boot_memlists();
        copy_boot_memlists(&boot_physinstalled, &boot_physinstalled_len,
            &boot_physavail, &boot_physavail_len,
            &boot_virtavail, &boot_virtavail_len);

        /*
         * Remember what the physically available highest page is
         * so that dumpsys works properly, and find out how much
         * memory is installed.
         */
        installed_top_size_memlist_array(boot_physinstalled,
            boot_physinstalled_len, &physmax, &physinstalled);
        PRM_DEBUG(physinstalled);
        PRM_DEBUG(physmax);

        /* Fill out memory nodes config structure */
        startup_build_mem_nodes(boot_physinstalled, boot_physinstalled_len);

        /*
         * npages is the maximum possible amount of available physical
         * memory (i.e., it will never be more than this).
         *
         * When we boot from a ramdisk, the ramdisk memory isn't free, so
         * using phys_avail will underestimate what will end up being freed.
         * A better initial guess is just total memory minus the kernel text.
         */
        npages = physinstalled - btop(MMU_PAGESIZE4M);

        /*
         * First allocate things that can go in the nucleus data page
         * (fault status, TSBs, dmv, CPUs)
         */
        ndata_alloc_init(&ndata, (uintptr_t)nalloc_base, (uintptr_t)nalloc_end);

        if ((&ndata_alloc_mmfsa != NULL) && (ndata_alloc_mmfsa(&ndata) != 0))
                cmn_err(CE_PANIC, "no more nucleus memory after mfsa alloc");

        if (ndata_alloc_tsbs(&ndata, npages) != 0)
                cmn_err(CE_PANIC, "no more nucleus memory after tsbs alloc");

        if (ndata_alloc_dmv(&ndata) != 0)
                cmn_err(CE_PANIC, "no more nucleus memory after dmv alloc");

        if (ndata_alloc_page_mutexs(&ndata) != 0)
                cmn_err(CE_PANIC,
                    "no more nucleus memory after page free lists alloc");

        if (ndata_alloc_hat(&ndata) != 0)
                cmn_err(CE_PANIC, "no more nucleus memory after hat alloc");

        if (ndata_alloc_memseg(&ndata, boot_physavail_len) != 0)
                cmn_err(CE_PANIC, "no more nucleus memory after memseg alloc");

        /*
         * WARNING WARNING WARNING WARNING WARNING WARNING WARNING
         *
         * There are comments all over the SFMMU code warning of dire
         * consequences if the TSBs are moved out of 32-bit space.  This
         * is largely because the asm code uses "sethi %hi(addr)"-type
         * instructions which will not provide the expected result if the
         * address is a 64-bit one.
         *
         * WARNING WARNING WARNING WARNING WARNING WARNING WARNING
         */
        alloc_base = (caddr_t)roundup((uintptr_t)nalloc_end, MMU_PAGESIZE);
        PRM_DEBUG(alloc_base);

        alloc_base = sfmmu_ktsb_alloc(alloc_base);
        alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, ecache_alignsize);
        PRM_DEBUG(alloc_base);

        /*
         * Allocate IOMMU TSB array.  We do this here so that the physical
         * memory gets deducted from the PROM's physical memory list.
         */
        alloc_base = iommu_tsb_init(alloc_base);
        alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, ecache_alignsize);
        PRM_DEBUG(alloc_base);

        /*
         * Allow for an early allocation of physically contiguous memory.
         */
        alloc_base = contig_mem_prealloc(alloc_base, npages);

        /*
         * Platforms like Starcat and OPL need special structures assigned in
         * 32-bit virtual address space because their probing routines execute
         * FCode, and FCode can't handle 64-bit virtual addresses...
         */
        if (&plat_startup_memlist) {
                alloc_base = plat_startup_memlist(alloc_base);
                alloc_base = (caddr_t)roundup((uintptr_t)alloc_base,
                    ecache_alignsize);
                PRM_DEBUG(alloc_base);
        }

        /*
         * Save off where the contiguous allocations to date have ended
         * in econtig32.
         */
        econtig32 = alloc_base;
        PRM_DEBUG(econtig32);
        if (econtig32 > (caddr_t)KERNEL_LIMIT32)
                cmn_err(CE_PANIC, "econtig32 too big");

        pp_sz = calc_pp_sz(npages);
        PRM_DEBUG(pp_sz);
        if (kpm_enable) {
                kpm_pp_sz = calc_kpmpp_sz(npages);
                PRM_DEBUG(kpm_pp_sz);
        }

        hmehash_sz = calc_hmehash_sz(npages);
        PRM_DEBUG(hmehash_sz);

        pagehash_sz = calc_pagehash_sz(npages);
        PRM_DEBUG(pagehash_sz);

        pagelist_sz = calc_free_pagelist_sz();
        PRM_DEBUG(pagelist_sz);

#ifdef  TRAPTRACE
        tt_sz = calc_traptrace_sz();
        PRM_DEBUG(tt_sz);
#else
        tt_sz = 0;
#endif  /* TRAPTRACE */

        /*
         * Place the array that protects pp->p_selock in the kmem64 wad.
         */
        pse_shift = size_pse_array(npages, max_ncpus);
        PRM_DEBUG(pse_shift);
        pse_table_size = 1 << pse_shift;
        PRM_DEBUG(pse_table_size);
        psetable_sz = roundup(
            pse_table_size * sizeof (pad_mutex_t), ecache_alignsize);
        PRM_DEBUG(psetable_sz);

        /*
         * Now allocate the whole wad
         */
        kmem64_sz = pp_sz + kpm_pp_sz + hmehash_sz + pagehash_sz +
            pagelist_sz + tt_sz + psetable_sz;
        kmem64_sz = roundup(kmem64_sz, PAGESIZE);
        kmem64_base = (caddr_t)syslimit;
        kmem64_end = kmem64_base + kmem64_sz;
        if (alloc_kmem64(kmem64_base, kmem64_end)) {
                /*
                 * The attempt to allocate one big contiguous chunk of
                 * memory for kmem64 failed.  We get here because we are
                 * sun4v, and will proceed by breaking up the allocation
                 * into two attempts.  First, we allocate kpm_pp_sz,
                 * hmehash_sz, pagehash_sz, pagelist_sz, tt_sz and
                 * psetable_sz as one contiguous chunk.  This is a much
                 * smaller chunk and we should get it; if not, we panic.
                 * Note that hmehash and tt need to be physically (in the
                 * real address sense) contiguous.  Next, we use
                 * bop_alloc_chunk() to allocate the page_t structures.
                 * This allows the page_t array to be allocated in
                 * multiple smaller chunks.  In doing so, the assumption
                 * that page_t is physically contiguous no longer holds;
                 * this is OK for sun4v but not for sun4u.
                 */
                size_t  tmp_size;
                caddr_t tmp_base;

                pp_sz  = roundup(pp_sz, PAGESIZE);

                /*
                 * Allocate kpm_pp_sz, hmehash_sz,
                 * pagehash_sz, pagelist_sz, tt_sz & psetable_sz
                 */
                tmp_base = kmem64_base + pp_sz;
                tmp_size = roundup(kpm_pp_sz + hmehash_sz + pagehash_sz +
                    pagelist_sz + tt_sz + psetable_sz, PAGESIZE);
                if (prom_alloc(tmp_base, tmp_size, PAGESIZE) == 0)
                        prom_panic("kmem64 prom_alloc contig failed");
                PRM_DEBUG(tmp_base);
                PRM_DEBUG(tmp_size);

                /*
                 * Allocate the page_ts
                 */
                if (bop_alloc_chunk(kmem64_base, pp_sz, PAGESIZE) == 0)
                        prom_panic("kmem64 bop_alloc_chunk page_t failed");
                PRM_DEBUG(kmem64_base);
                PRM_DEBUG(pp_sz);

                kmem64_aligned_end = kmem64_base + pp_sz + tmp_size;
                ASSERT(kmem64_aligned_end >= kmem64_end);

                kmem64_smchunks = 1;
        } else {

                /*
                 * We need to adjust pp_sz for the normal
                 * case where kmem64 can allocate one large chunk
                 */
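                /*
                 * A sketch of the arithmetic: each managed page costs
                 * PAGESIZE bytes of memory plus a page_t (and possibly a
                 * kpm_spage_t) in the wad, so dividing kmem64_sz by that
                 * per-page cost estimates how many managed pages the wad
                 * displaces.
                 */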
                if (kpm_smallpages == 0) {
                        npages -= kmem64_sz / (PAGESIZE + sizeof (struct page));
                } else {
                        npages -= kmem64_sz / (PAGESIZE + sizeof (struct page) +
                            sizeof (kpm_spage_t));
                }
                pp_sz = npages * sizeof (struct page);
        }

        if (kmem64_aligned_end > (hole_start ? hole_start : kpm_vbase))
                cmn_err(CE_PANIC, "not enough kmem64 space");
        PRM_DEBUG(kmem64_base);
        PRM_DEBUG(kmem64_end);
        PRM_DEBUG(kmem64_aligned_end);

        /*
         * ... and divy it up
         */
        alloc_base = kmem64_base;

        pp_base = (page_t *)alloc_base;
        alloc_base += pp_sz;
        alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, ecache_alignsize);
        PRM_DEBUG(pp_base);
        PRM_DEBUG(npages);

        if (kpm_enable) {
                kpm_pp_base = alloc_base;
                if (kpm_smallpages == 0) {
                        /* kpm_npages based on physinstalled, don't reset */
                        kpm_pp_sz = kpm_npages * sizeof (kpm_page_t);
                } else {
                        kpm_npages = ptokpmpr(npages);
                        kpm_pp_sz = kpm_npages * sizeof (kpm_spage_t);
                }
                alloc_base += kpm_pp_sz;
                alloc_base =
                    (caddr_t)roundup((uintptr_t)alloc_base, ecache_alignsize);
                PRM_DEBUG(kpm_pp_base);
        }

        alloc_base = alloc_hmehash(alloc_base);
        alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, ecache_alignsize);
        PRM_DEBUG(alloc_base);

        page_hash = (page_t **)alloc_base;
        alloc_base += pagehash_sz;
        alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, ecache_alignsize);
        PRM_DEBUG(page_hash);

        alloc_base = alloc_page_freelists(alloc_base);
        alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, ecache_alignsize);
        PRM_DEBUG(alloc_base);

#ifdef  TRAPTRACE
        ttrace_buf = alloc_base;
        alloc_base += tt_sz;
        alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, ecache_alignsize);
        PRM_DEBUG(alloc_base);
#endif  /* TRAPTRACE */

        pse_mutex = (pad_mutex_t *)alloc_base;
        alloc_base += psetable_sz;
        alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, ecache_alignsize);
        PRM_DEBUG(alloc_base);

        /*
         * Note that if we use small chunk allocations for
         * kmem64, we need to ensure kmem64_end is the same as
         * kmem64_aligned_end to prevent subsequent logic from
         * trying to reuse the overmapping.
         * Otherwise we adjust kmem64_end to what we really allocated.
         */
        if (kmem64_smchunks) {
                kmem64_end = kmem64_aligned_end;
        } else {
                kmem64_end = (caddr_t)roundup((uintptr_t)alloc_base, PAGESIZE);
        }
        kmem64_sz = kmem64_end - kmem64_base;

        if (&ecache_init_scrub_flush_area) {
                alloc_base = ecache_init_scrub_flush_area(kmem64_aligned_end);
                ASSERT(alloc_base <= (hole_start ? hole_start : kpm_vbase));
        }

        /*
         * If physmem is patched to be non-zero, use it instead of
         * the monitor value unless physmem is larger than the total
         * amount of memory on hand.
         */
        if (physmem == 0 || physmem > npages)
                physmem = npages;

        /*
         * root_is_ramdisk is set via /etc/system when the ramdisk miniroot
1325          * is mounted as root. This memory is held down by OBP and,
1326          * unlike the stub boot_archive, is never released.
1327          *
1328          * In order to get things sized correctly on lower memory
1329          * machines (where the memory used by the ramdisk represents
1330          * a significant portion of memory), physmem is adjusted.
1331          *
1332          * This is done by subtracting the ramdisk_size which is set
1333          * to the size of the ramdisk (in KB) in /etc/system at the
1334          * time the miniroot archive is constructed.
1335          */
1336         if (root_is_ramdisk == B_TRUE) {
1337                 ramdisk_npages = (ramdisk_size * 1024) / PAGESIZE;
1338                 physmem -= ramdisk_npages;
1339         }
1340 
1341         if (kpm_enable && (ndata_alloc_kpm(&ndata, kpm_npages) != 0))
1342                 cmn_err(CE_PANIC, "no more nucleus memory after kpm alloc");
1343 
1344         /*
1345          * Allocate space for the interrupt vector table.
1346          */
1347         memspace = prom_alloc((caddr_t)intr_vec_table, IVSIZE, MMU_PAGESIZE);
1348         if (memspace != (caddr_t)intr_vec_table)
1349                 prom_panic("interrupt vector table allocation failure");
1350 
1351         /*
1352          * Between now and when we finish copying in the memory lists,
1353          * allocations happen, so the space gets fragmented and the
1354          * lists grow longer.  Leave enough space for lists twice as
1355          * long as we have now, then round up to a page size.
1356          */
1357         memlist_sz = sizeof (struct memlist) * (prom_phys_installed_len() +
1358             prom_phys_avail_len() + prom_virt_avail_len());
1359         memlist_sz *= 2;
1360         memlist_sz = roundup(memlist_sz, PAGESIZE);
1361         memspace = ndata_alloc(&ndata, memlist_sz, ecache_alignsize);
1362         if (memspace == NULL)
1363                 cmn_err(CE_PANIC, "no more nucleus memory after memlist alloc");
1364 
1365         memlist = (struct memlist *)memspace;
1366         memlist_end = (char *)memspace + memlist_sz;
1367         PRM_DEBUG(memlist);
1368         PRM_DEBUG(memlist_end);
1369 
1370         PRM_DEBUG(sysbase);
1371         PRM_DEBUG(syslimit);
1372         kernelheap_init((void *)sysbase, (void *)syslimit,
1373             (caddr_t)sysbase + PAGESIZE, NULL, NULL);
1374 
1375         /*
1376          * Take the most current snapshot we can by calling mem-update.
1377          */
1378         copy_boot_memlists(&boot_physinstalled, &boot_physinstalled_len,
1379             &boot_physavail, &boot_physavail_len,
1380             &boot_virtavail, &boot_virtavail_len);
1381 
1382         /*
1383          * Remove the space used by prom_alloc from the kernel heap
1384          * plus the area actually used by the OBP (if any)
1385          * ignoring virtual addresses in virt_avail, above syslimit.
1386          */
1387         virt_avail = memlist;
1388         copy_memlist(boot_virtavail, boot_virtavail_len, &memlist);
1389 
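        /*
         * Each gap between consecutive virt_avail entries is virtual
         * address space already in use by boot or the OBP.  Reserve
         * any such gaps that fall within [sysbase, syslimit) from
         * heap_arena so the kernel heap never hands them out.
         */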
1390         for (cur = virt_avail; cur->ml_next; cur = cur->ml_next) {
1391                 uint64_t range_base, range_size;
1392 
1393                 if ((range_base = cur->ml_address + cur->ml_size) <
1394                     (uint64_t)sysbase)
1395                         continue;
1396                 if (range_base >= (uint64_t)syslimit)
1397                         break;
1398                 /*
1399                  * Limit the range to end at syslimit.
1400                  */
1401                 range_size = MIN(cur->ml_next->ml_address,
1402                     (uint64_t)syslimit) - range_base;
1403                 (void) vmem_xalloc(heap_arena, (size_t)range_size, PAGESIZE,
1404                     0, 0, (void *)range_base, (void *)(range_base + range_size),
1405                     VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
1406         }
1407 
1408         phys_avail = memlist;
1409         copy_memlist(boot_physavail, boot_physavail_len, &memlist);
1410 
1411         /*
1412          * Add any extra memory at the end of the ndata region if there's at
1413          * least a page to add.  There might be a few more pages available in
1414          * the middle of the ndata region, but for now they are ignored.
1415          */
1416         nalloc_base = ndata_extra_base(&ndata, MMU_PAGESIZE, nalloc_end);
1417         if (nalloc_base == NULL)
1418                 nalloc_base = nalloc_end;
1419         ndata_remain_sz = nalloc_end - nalloc_base;
1420 
1421         /*
1422          * Copy physinstalled list into kernel space.
1423          */
1424         phys_install = memlist;
1425         copy_memlist(boot_physinstalled, boot_physinstalled_len, &memlist);
1426 
1427         /*
1428          * Create list of physical addrs we don't need pp's for:
1429          * kernel text 4M page
1430          * kernel data 4M page - ndata_remain_sz
1431          * kmem64 pages
1432          *
1433          * NB if adding any pages here, make sure no kpm page
1434          * overlaps can occur (see ASSERTs in kphysm_memsegs)
1435          */
1436         nopp_list = memlist;
1437         memlist_new(va_to_pa(s_text), MMU_PAGESIZE4M, &memlist);
1438         memlist_add(va_to_pa(s_data), MMU_PAGESIZE4M - ndata_remain_sz,
1439             &memlist, &nopp_list);
1440 
1441         /* Don't add to nopp_list if kmem64 was allocated in smchunks */
1442         if (!kmem64_smchunks)
1443                 memlist_add(kmem64_pabase, kmem64_sz, &memlist, &nopp_list);
1444 
1445         if ((caddr_t)memlist > (memspace + memlist_sz))
1446                 prom_panic("memlist overflow");
1447 
1448         /*
1449          * Size the pcf array based on the number of cpus in the box at
1450          * boot time.
1451          */
1452         pcf_init();
1453 
1454         /*
1455          * Initialize the page structures from the memory lists.
1456          */
1457         kphysm_init();
1458 
1459         availrmem_initial = availrmem = freemem;
1460         PRM_DEBUG(availrmem);
1461 
1462         /*
1463          * Some of the locks depend on page_hashsz being set!
1464          * kmem_init() depends on this, so keep it here.
1465          */
1466         page_lock_init();
1467 
1468         /*
1469          * Initialize kernel memory allocator.
1470          */
1471         kmem_init();
1472 
1473         /*
1474          * Factor in colorequiv to check additional 'equivalent' bins
1475          */
1476         if (&page_set_colorequiv_arr_cpu != NULL)
1477                 page_set_colorequiv_arr_cpu();
1478         else
1479                 page_set_colorequiv_arr();
1480 
1481         /*
1482          * Initialize bp_mapin().
1483          */
1484         bp_init(shm_alignment, HAT_STRICTORDER);
1485 
1486         /*
1487          * Reserve space for MPO mblock structs from the 32-bit heap.
1488          */
1489 
1490         if (mpo_heap32_bufsz > (size_t)0) {
1491                 (void) vmem_xalloc(heap32_arena, mpo_heap32_bufsz,
1492                     PAGESIZE, 0, 0, mpo_heap32_buf,
1493                     mpo_heap32_buf + mpo_heap32_bufsz,
1494                     VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
1495         }
1496         mem_config_init();
1497 }
1498 
1499 static void
1500 startup_modules(void)
1501 {
1502         int nhblk1, nhblk8;
1503         size_t  nhblksz;
1504         pgcnt_t pages_per_hblk;
1505         size_t hme8blk_sz, hme1blk_sz;
1506 
1507         /*
1508          * The system file /etc/system was read already under startup_memlist.
1509          */
1510         if (&set_platform_defaults)
1511                 set_platform_defaults();
1512 
1513         /*
1514          * Calculate default settings of system parameters based upon
1515          * maxusers, yet allow them to be overridden via the /etc/system file.
1516          */
1517         param_calc(0);
1518 
1519         mod_setup();
1520 
1521         /*
1522          * If this is a positron, complain and halt.
1523          */
1524         if (&iam_positron && iam_positron()) {
1525                 cmn_err(CE_WARN, "This hardware platform is not supported"
1526                     " by this release of Solaris.\n");
1527 #ifdef DEBUG
1528                 prom_enter_mon();       /* Type 'go' to resume */
1529                 cmn_err(CE_WARN, "Booting an unsupported platform.\n");
1530                 cmn_err(CE_WARN, "Booting with down-rev firmware.\n");
1531 
1532 #else /* DEBUG */
1533                 halt(0);
1534 #endif /* DEBUG */
1535         }
1536 
1537         /*
1538          * If we are running firmware that isn't 64-bit ready
1539          * then complain and halt.
1540          */
1541         do_prom_version_check();
1542 
1543         /*
1544          * Initialize system parameters
1545          */
1546         param_init();
1547 
1548         /*
1549          * maxmem is the amount of physical memory we're playing with.
1550          */
1551         maxmem = physmem;
1552 
1553         /* Set segkp limits. */
1554         ncbase = kdi_segdebugbase;
1555         ncend = kdi_segdebugbase;
1556 
1557         /*
1558          * Initialize the hat layer.
1559          */
1560         hat_init();
1561 
1562         /*
1563          * Initialize segment management stuff.
1564          */
1565         seg_init();
1566 
1567         /*
1568          * Create the va>tte handler, so the prom can understand
1569          * kernel translations.  The handler is installed later, just
1570          * as we are about to take over the trap table from the prom.
1571          */
1572         create_va_to_tte();
1573 
1574         /*
1575          * Load the forthdebugger (optional)
1576          */
1577         forthdebug_init();
1578 
1579         /*
1580          * Create OBP node for console input callbacks
1581          * if it is needed.
1582          */
1583         startup_create_io_node();
1584 
1585         if (modloadonly("fs", "specfs") == -1)
1586                 halt("Can't load specfs");
1587 
1588         if (modloadonly("fs", "devfs") == -1)
1589                 halt("Can't load devfs");
1590 
1591         if (modloadonly("fs", "procfs") == -1)
1592                 halt("Can't load procfs");
1593 
1594         if (modloadonly("misc", "swapgeneric") == -1)
1595                 halt("Can't load swapgeneric");
1596 
1597         (void) modloadonly("sys", "lbl_edition");
1598 
1599         dispinit();
1600 
1601         /*
1602          * Infer meanings to the members of the idprom buffer.
1603          */
1604         parse_idprom();
1605 
1606         /* Read cluster configuration data. */
1607         clconf_init();
1608 
1609         setup_ddi();
1610 
1611         /*
1612          * Let's take this opportunity to load the root device.
1613          */
1614         if (loadrootmodules() != 0)
1615                 debug_enter("Can't load the root filesystem");
1616 
1617         /*
1618          * Load tod driver module for the tod part found on this system.
1619          * Recompute the cpu frequency/delays based on tod, as the tod
1620          * part tends to keep time more accurately.
1621          */
1622         if (&load_tod_module)
1623                 load_tod_module();
1624 
1625         /*
1626          * Allow platforms to load modules which might
1627          * be needed after bootops are gone.
1628          */
1629         if (&load_platform_modules)
1630                 load_platform_modules();
1631 
1632         setcpudelay();
1633 
1634         copy_boot_memlists(&boot_physinstalled, &boot_physinstalled_len,
1635             &boot_physavail, &boot_physavail_len,
1636             &boot_virtavail, &boot_virtavail_len);
1637 
1638         /*
1639          * Calculate and allocate the hmeblks needed to remap
1640          * the memory allocated by the PROM up to this point.
1641          * Overestimate the number of hblk1 elements by assuming a
1642          * worst case of TTE64K mappings.
1643          * sfmmu_hblk_alloc will panic if this calculation is wrong.
1644          */
1645         bop_alloc_pages = btopr(kmem64_end - kmem64_base);
1646         pages_per_hblk = btop(HMEBLK_SPAN(TTE64K));
1647         bop_alloc_pages = roundup(bop_alloc_pages, pages_per_hblk);
1648         nhblk1 = bop_alloc_pages / pages_per_hblk + hblk1_min;
1649 
1650         bop_alloc_pages = size_virtalloc(boot_virtavail, boot_virtavail_len);
1651 
1652         /* sfmmu_init_nucleus_hblks expects properly aligned data structures */
1653         hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t));
1654         hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t));
1655 
1656         bop_alloc_pages += btopr(nhblk1 * hme1blk_sz);
1657 
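        /*
         * The hblk8s used to map bop_alloc_pages themselves occupy
         * pages that must also be mapped, so iterate: convert the
         * residual page count to hblk8s, then to the pages those
         * hblk8s occupy, until the residue is down to a single page.
         */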
1658         pages_per_hblk = btop(HMEBLK_SPAN(TTE8K));
1659         nhblk8 = 0;
1660         while (bop_alloc_pages > 1) {
1661                 bop_alloc_pages = roundup(bop_alloc_pages, pages_per_hblk);
1662                 nhblk8 += bop_alloc_pages /= pages_per_hblk;
1663                 bop_alloc_pages *= hme8blk_sz;
1664                 bop_alloc_pages = btopr(bop_alloc_pages);
1665         }
1666         nhblk8 += 2;
1667 
1668         /*
1669          * Since hblk8's can hold up to 64k of mappings aligned on a 64k
1670          * boundary, the number of hblk8's needed to map the entries in the
1671          * boot_virtavail list needs to be adjusted to take this into
1672          * consideration.  Thus, we need to add additional hblk8's since it
1673          * is possible that an hblk8 will not have all 8 slots used due to
1674          * alignment constraints.  Since there were boot_virtavail_len entries
1675          * in that list, we need to add that many hblk8's to the number
1676          * already calculated to make sure we don't underestimate.
1677          */
1678         nhblk8 += boot_virtavail_len;
1679         nhblksz = nhblk8 * hme8blk_sz + nhblk1 * hme1blk_sz;
1680 
1681         /* Allocate in pagesize chunks */
1682         nhblksz = roundup(nhblksz, MMU_PAGESIZE);
1683         hblk_base = kmem_zalloc(nhblksz, KM_SLEEP);
1684         sfmmu_init_nucleus_hblks(hblk_base, nhblksz, nhblk8, nhblk1);
1685 }
1686 
1687 static void
1688 startup_bop_gone(void)
1689 {
1690 
1691         /*
1692          * Destroy the MD initialized at startup.
1693          * Startup initialized the MD framework using prom and BOP
1694          * allocations; free those resources now.
1695          */
1696         mach_descrip_startup_fini();
1697 
1698         /*
1699          * We're done with prom allocations.
1700          */
1701         bop_fini();
1702 
1703         copy_boot_memlists(&boot_physinstalled, &boot_physinstalled_len,
1704             &boot_physavail, &boot_physavail_len,
1705             &boot_virtavail, &boot_virtavail_len);
1706 
1707         /*
1708          * Set up a physically contiguous area twice as large as the
1709          * ecache; it is used when doing a displacement flush of ecaches.
1710          */
1711         if (&ecache_flush_address) {
1712                 ecache_flushaddr = ecache_flush_address();
1713                 if (ecache_flushaddr == (uint64_t)-1) {
1714                         cmn_err(CE_PANIC,
1715                             "startup: no memory to set ecache_flushaddr");
1716                 }
1717         }
1718 
1719         /*
1720          * Virtual available next.
1721          */
1722         ASSERT(virt_avail != NULL);
1723         memlist_free_list(virt_avail);
1724         virt_avail = memlist;
1725         copy_memlist(boot_virtavail, boot_virtavail_len, &memlist);
1726 
1727 }
1728 
1729 
1730 /*
1731  * startup_fixup_physavail - called from mach_sfmmu.c after the final
1732  * allocations have been performed.  We can't call it in startup_bop_gone
1733  * since later operations can cause obp to allocate more memory.
1734  */
1735 void
1736 startup_fixup_physavail(void)
1737 {
1738         struct memlist *cur;
1739         size_t kmem64_overmap_size = kmem64_aligned_end - kmem64_end;
1740 
1741         PRM_DEBUG(kmem64_overmap_size);
1742 
1743         /*
1744          * Take the most current snapshot we can by calling mem-update.
1745          */
1746         copy_boot_memlists(&boot_physinstalled, &boot_physinstalled_len,
1747             &boot_physavail, &boot_physavail_len,
1748             &boot_virtavail, &boot_virtavail_len);
1749 
1750         /*
1751          * Copy phys_avail list, again.
1752          * Both the kernel/boot and the prom have been allocating
1753          * from the original list we copied earlier.
1754          */
1755         cur = memlist;
1756         copy_memlist(boot_physavail, boot_physavail_len, &memlist);
1757 
1758         /*
1759          * Add any unused kmem64 memory from overmapped page
1760          * (Note: va_to_pa does not work for kmem64_end)
1761          */
1762         if (kmem64_overmap_size) {
1763                 memlist_add(kmem64_pabase + (kmem64_end - kmem64_base),
1764                     kmem64_overmap_size, &memlist, &cur);
1765         }
1766 
1767         /*
1768          * Add any extra memory after e_data we added to the phys_avail list
1769          * back to the old list.
1770          */
1771         if (ndata_remain_sz >= MMU_PAGESIZE)
1772                 memlist_add(va_to_pa(nalloc_base),
1773                     (uint64_t)ndata_remain_sz, &memlist, &cur);
1774 
1775         /*
1776          * There isn't any bounds checking on the memlist area
1777          * so ensure it hasn't overgrown.
1778          */
1779         if ((caddr_t)memlist > (caddr_t)memlist_end)
1780                 cmn_err(CE_PANIC, "startup: memlist size exceeded");
1781 
1782         /*
1783          * The kernel removes the pages that were allocated for it from
1784          * the freelist, but we now have to find any -extra- pages that
1785          * the prom has allocated for its own bookkeeping, and remove
1786          * them from the freelist too. sigh.
1787          */
1788         sync_memlists(phys_avail, cur);
1789 
1790         ASSERT(phys_avail != NULL);
1791 
1792         old_phys_avail = phys_avail;
1793         phys_avail = cur;
1794 }
1795 
1796 void
1797 update_kcage_ranges(uint64_t addr, uint64_t len)
1798 {
1799         pfn_t base = btop(addr);
1800         pgcnt_t num = btop(len);
1801         int rv;
1802 
1803         rv = kcage_range_add(base, num, kcage_startup_dir);
1804 
1805         if (rv == ENOMEM) {
1806                 cmn_err(CE_WARN, "%ld megabytes not available to kernel cage",
1807                     (len == 0 ? 0 : BYTES_TO_MB(len)));
1808         } else if (rv != 0) {
1809                 /* catch this in debug kernels */
1810                 ASSERT(0);
1811 
1812                 cmn_err(CE_WARN, "unexpected kcage_range_add"
1813                     " return value %d", rv);
1814         }
1815 }
1816 
1817 static void
1818 startup_vm(void)
1819 {
1820         size_t  i;
1821         struct segmap_crargs a;
1822         struct segkpm_crargs b;
1823 
1824         uint64_t avmem;
1825         caddr_t va;
1826         pgcnt_t max_phys_segkp;
1827         int     mnode;
1828 
1829         extern int use_brk_lpg, use_stk_lpg;
1830 
1831         /*
1832          * get prom's mappings, create hments for them and switch
1833          * to the kernel context.
1834          */
1835         hat_kern_setup();
1836 
1837         /*
1838          * Take over trap table
1839          */
1840         setup_trap_table();
1841 
1842         /*
1843          * Install the va>tte handler, so that the prom can handle
1844          * misses and understand the kernel table layout in case
1845          * we need to call into the prom.
1846          */
1847         install_va_to_tte();
1848 
1849         /*
1850          * Set a flag to indicate that the tba has been taken over.
1851          */
1852         tba_taken_over = 1;
1853 
1854         /* initialize MMU primary context register */
1855         mmu_init_kcontext();
1856 
1857         /*
1858          * The boot cpu can now take interrupts, x-calls, x-traps
1859          */
1860         CPUSET_ADD(cpu_ready_set, CPU->cpu_id);
1861         CPU->cpu_flags |= (CPU_READY | CPU_ENABLE | CPU_EXISTS);
1862 
1863         /*
1864          * Set a flag to tell write_scb_int() that it can access V_TBR_WR_ADDR.
1865          */
1866         tbr_wr_addr_inited = 1;
1867 
1868         /*
1869          * Initialize VM system, and map kernel address space.
1870          */
1871         kvm_init();
1872 
1873         ASSERT(old_phys_avail != NULL && phys_avail != NULL);
1874         if (kernel_cage_enable) {
1875                 diff_memlists(phys_avail, old_phys_avail, update_kcage_ranges);
1876         }
1877         memlist_free_list(old_phys_avail);
1878 
1879         /*
1880          * If the following is true, someone has patched
1881          * physmem to be less than the number of pages that
1882          * the system actually has.  Remove pages until system
1883          * memory is limited to the requested amount.  Since we
1884          * have allocated page structures for all pages, we
1885          * correct the amount of memory we want to remove
1886          * by the size of the memory used to hold page structures
1887          * for the non-used pages.
1888          */
1889         if (physmem + ramdisk_npages < npages) {
1890                 pgcnt_t diff, off;
1891                 struct page *pp;
1892                 struct seg kseg;
1893 
1894                 cmn_err(CE_WARN, "limiting physmem to %ld pages", physmem);
1895 
1896                 off = 0;
1897                 diff = npages - (physmem + ramdisk_npages);
1898                 diff -= mmu_btopr(diff * sizeof (struct page));
1899                 kseg.s_as = &kas;
1900                 while (diff--) {
1901                         pp = page_create_va(&unused_pages_vp, (offset_t)off,
1902                             MMU_PAGESIZE, PG_WAIT | PG_EXCL,
1903                             &kseg, (caddr_t)off);
1904                         if (pp == NULL)
1905                                 cmn_err(CE_PANIC, "limited physmem too much!");
1906                         page_io_unlock(pp);
1907                         page_downgrade(pp);
1908                         availrmem--;
1909                         off += MMU_PAGESIZE;
1910                 }
1911         }
1912 
1913         /*
1914          * When printing memory, show the total as physmem less
1915          * the memory stolen by a debugger.
1916          */
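        /*
         * The shifts convert pages for display: << (PAGESHIFT - 10)
         * yields kilobytes, while << (PAGESHIFT - 12) yields the
         * value in 0x1000-byte units to match the "0x%lx000" format.
         */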
1917         cmn_err(CE_CONT, "?mem = %ldK (0x%lx000)\n",
1918             (ulong_t)(physinstalled) << (PAGESHIFT - 10),
1919             (ulong_t)(physinstalled) << (PAGESHIFT - 12));
1920 
1921         avmem = (uint64_t)freemem << PAGESHIFT;
1922         cmn_err(CE_CONT, "?avail mem = %lld\n", (unsigned long long)avmem);
1923 
1924         /*
1925          * For small memory systems disable automatic large pages.
1926          */
1927         if (physmem < privm_lpg_min_physmem) {
1928                 use_brk_lpg = 0;
1929                 use_stk_lpg = 0;
1930         }
1931 
1932         /*
1933          * Perform platform specific freelist processing
1934          */
1935         if (&plat_freelist_process) {
1936                 for (mnode = 0; mnode < max_mem_nodes; mnode++)
1937                         if (mem_node_config[mnode].exists)
1938                                 plat_freelist_process(mnode);
1939         }
1940 
1941         /*
1942          * Initialize the segkp segment type.  We position it
1943          * after the configured tables and buffers (whose end
1944          * is given by econtig) and before V_WKBASE_ADDR.
1945          * Also in this area is segkmap (size SEGMAPSIZE).
1946          */
1947 
1948         /* XXX - cache alignment? */
1949         va = (caddr_t)SEGKPBASE;
1950         ASSERT(((uintptr_t)va & PAGEOFFSET) == 0);
1951 
1952         max_phys_segkp = (physmem * 2);
1953 
1954         if (segkpsize < btop(SEGKPMINSIZE) || segkpsize > btop(SEGKPMAXSIZE)) {
1955                 segkpsize = btop(SEGKPDEFSIZE);
1956                 cmn_err(CE_WARN, "Illegal value for segkpsize. "
1957                     "segkpsize has been reset to %ld pages", segkpsize);
1958         }
1959 
1960         i = ptob(MIN(segkpsize, max_phys_segkp));
1961 
1962         rw_enter(&kas.a_lock, RW_WRITER);
1963         if (seg_attach(&kas, va, i, segkp) < 0)
1964                 cmn_err(CE_PANIC, "startup: cannot attach segkp");
1965         if (segkp_create(segkp) != 0)
1966                 cmn_err(CE_PANIC, "startup: segkp_create failed");
1967         rw_exit(&kas.a_lock);
1968 
1969         /*
1970          * kpm segment
1971          */
1972         segmap_kpm = kpm_enable &&
1973             segmap_kpm && PAGESIZE == MAXBSIZE;
1974 
1975         if (kpm_enable) {
1976                 rw_enter(&kas.a_lock, RW_WRITER);
1977 
1978                 /*
1979                  * The segkpm virtual range is larger than the
1980                  * actual physical memory size and also covers gaps in
1981                  * the physical address range for the following reasons:
1982                  * . keep conversion between segkpm and physical addresses
1983                  *   simple, cheap and unambiguous.
1984                  * . avoid extension/shrink of the segkpm in case of DR.
1985                  * . avoid complexity for handling of virtual addressed
1986                  *   caches, segkpm and the regular mapping scheme must be
1987                  *   kept in sync wrt. the virtual color of mapped pages.
1988                  * Any accesses to virtual segkpm ranges not backed by
1989                  * physical memory will fall through the memseg pfn hash
1990                  * and will be handled in segkpm_fault.
1991                  * Additional kpm_size spaces needed for vac alias prevention.
1992                  */
1993                 if (seg_attach(&kas, kpm_vbase, kpm_size * vac_colors,
1994                     segkpm) < 0)
1995                         cmn_err(CE_PANIC, "cannot attach segkpm");
1996 
1997                 b.prot = PROT_READ | PROT_WRITE;
1998                 b.nvcolors = shm_alignment >> MMU_PAGESHIFT;
1999 
2000                 if (segkpm_create(segkpm, (caddr_t)&b) != 0)
2001                         panic("segkpm_create segkpm");
2002 
2003                 rw_exit(&kas.a_lock);
2004 
2005                 mach_kpm_init();
2006         }
2007 
2008         va = kpm_vbase + (kpm_size * vac_colors);
2009 
2010         if (!segzio_fromheap) {
2011                 size_t size;
2012                 size_t physmem_b = mmu_ptob(physmem);
2013 
2014                 /* size is in bytes, segziosize is in pages */
2015                 if (segziosize == 0) {
2016                         size = physmem_b;
2017                 } else {
2018                         size = mmu_ptob(segziosize);
2019                 }
2020 
2021                 if (size < SEGZIOMINSIZE) {
2022                         size = SEGZIOMINSIZE;
2023                 } else if (size > SEGZIOMAXSIZE) {
2024                         size = SEGZIOMAXSIZE;
2025                         /*
2026                          * On 64-bit x86 we only have 2TB of KVA; this
2027                          * cap exists for parity with x86.
2028                          *
2029                          * SEGZIOMAXSIZE is capped at 512GB so that segzio
2030                          * doesn't consume all of KVA.  However, if we have a
2031                          * system with more than 512GB of physical memory,
2032                          * we can actually consume about half of the difference
2033                          * between 512GB and the rest of the available physical
2034                          * memory.
2035                          */
2036                         if (physmem_b > SEGZIOMAXSIZE) {
2037                                 size += (physmem_b - SEGZIOMAXSIZE) / 2;
2038                         }
2039                 }
2040                 segziosize = mmu_btop(roundup(size, MMU_PAGESIZE));
2041                 /* put the base of the ZIO segment after the kpm segment */
2042                 segzio_base = va;
2043                 va += mmu_ptob(segziosize);
2044                 PRM_DEBUG(segziosize);
2045                 PRM_DEBUG(segzio_base);
2046 
2047                 /*
2048                  * On some platforms, kvm_init is called after the kpm
2049                  * sizes have been determined.  On SPARC, kvm_init is called
2050                  * before, so we have to attach the kzioseg after kvm is
2051                  * initialized, otherwise we'll try to allocate from the boot
2052                  * area since the kernel heap hasn't yet been configured.
2053                  */
2054                 rw_enter(&kas.a_lock, RW_WRITER);
2055 
2056                 (void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
2057                     &kzioseg);
2058                 (void) segkmem_zio_create(&kzioseg);
2059 
2060                 /* create zio area covering new segment */
2061                 segkmem_zio_init(segzio_base, mmu_ptob(segziosize));
2062 
2063                 rw_exit(&kas.a_lock);
2064         }
2065 
2066         if (ppvm_enable) {
2067                 caddr_t ppvm_max;
2068 
2069                 /*
2070                  * ppvm refers to the static VA space used to map
2071                  * the page_t's for dynamically added memory.
2072                  *
2073                  * ppvm_base should not cross a potential VA hole.
2074                  *
2075                  * ppvm_size should be large enough to map the
2076                  * page_t's needed to manage all of KPM range.
2077                  */
2078                 ppvm_size =
2079                     roundup(mmu_btop(kpm_size * vac_colors) * sizeof (page_t),
2080                     MMU_PAGESIZE);
2081                 ppvm_max = (caddr_t)(0ull - ppvm_size);
2082                 ppvm_base = (page_t *)va;
2083 
2084                 if ((caddr_t)ppvm_base <= hole_end) {
2085                         cmn_err(CE_WARN,
2086                             "Memory DR disabled: invalid DR map base: 0x%p\n",
2087                             (void *)ppvm_base);
2088                         ppvm_enable = 0;
2089                 } else if ((caddr_t)ppvm_base > ppvm_max) {
2090                         uint64_t diff = (caddr_t)ppvm_base - ppvm_max;
2091 
2092                         cmn_err(CE_WARN,
2093                             "Memory DR disabled: insufficient DR map size:"
2094                             " 0x%lx (needed 0x%lx)\n",
2095                             ppvm_size - diff, ppvm_size);
2096                         ppvm_enable = 0;
2097                 }
2098                 PRM_DEBUG(ppvm_size);
2099                 PRM_DEBUG(ppvm_base);
2100         }
2101 
2102         /*
2103          * Now create generic mapping segment.  This mapping
2104          * goes SEGMAPSIZE beyond SEGMAPBASE.  But if the total
2105          * virtual address space is greater than the amount of free
2106          * memory that is available, then we trim back the
2107          * segment size to that amount
2108          */
2109         va = (caddr_t)SEGMAPBASE;
2110 
2111         /*
2112          * 1201049: segkmap base address must be MAXBSIZE aligned
2113          */
2114         ASSERT(((uintptr_t)va & MAXBOFFSET) == 0);
2115 
2116         /*
2117          * Set size of segmap to percentage of freemem at boot,
2118          * but stay within the allowable range
2119          * Note we take the percentage before converting from pages
2120          * to bytes to avoid an overflow on 32-bit kernels.
2121          */
2122         i = mmu_ptob((freemem * segmap_percent) / 100);
2123 
2124         if (i < MINMAPSIZE)
2125                 i = MINMAPSIZE;
2126 
2127         if (i > MIN(SEGMAPSIZE, mmu_ptob(freemem)))
2128                 i = MIN(SEGMAPSIZE, mmu_ptob(freemem));
2129 
2130         i &= MAXBMASK;      /* 1201049: segkmap size must be MAXBSIZE aligned */
2131 
2132         rw_enter(&kas.a_lock, RW_WRITER);
2133         if (seg_attach(&kas, va, i, segkmap) < 0)
2134                 cmn_err(CE_PANIC, "cannot attach segkmap");
2135 
2136         a.prot = PROT_READ | PROT_WRITE;
2137         a.shmsize = shm_alignment;
2138         a.nfreelist = 0;        /* use segmap driver defaults */
2139 
2140         if (segmap_create(segkmap, (caddr_t)&a) != 0)
2141                 panic("segmap_create segkmap");
2142         rw_exit(&kas.a_lock);
2143 
2144         segdev_init();
2145 }
2146 
2147 static void
2148 startup_end(void)
2149 {
2150         if ((caddr_t)memlist > (caddr_t)memlist_end)
2151                 panic("memlist overflow 2");
2152         memlist_free_block((caddr_t)memlist,
2153             ((caddr_t)memlist_end - (caddr_t)memlist));
2154         memlist = NULL;
2155 
2156         /* enable page_relocation since OBP is now done */
2157         page_relocate_ready = 1;
2158 
2159         /*
2160          * Perform tasks that get done after most of the VM
2161          * initialization has been done but before the clock
2162          * and other devices get started.
2163          */
2164         kern_setup1();
2165 
2166         /*
2167          * Perform CPC initialization for this CPU.
2168          */
2169         kcpc_hw_init();
2170 
2171         /*
2172          * Initialize the VM arenas for allocating physically
2173          * contiguous memory chunks for interrupt queues, and
2174          * allocate/register the boot cpu's queues, if any.  Also
2175          * allocate the dump buffer for sun4v systems to store
2176          * extra crash information during a crash dump.
2177          */
2178         contig_mem_init();
2179         mach_descrip_init();
2180 
2181         if (cpu_intrq_setup(CPU)) {
2182                 cmn_err(CE_PANIC, "cpu%d: setup failed", CPU->cpu_id);
2183         }
2184         cpu_intrq_register(CPU);
2185         mach_htraptrace_setup(CPU->cpu_id);
2186         mach_htraptrace_configure(CPU->cpu_id);
2187         mach_dump_buffer_init();
2188 
2189         /*
2190          * Initialize interrupt related stuff
2191          */
2192         cpu_intr_alloc(CPU, NINTR_THREADS);
2193 
2194         (void) splzs();                 /* allow hi clock ints but not zs */
2195 
2196         /*
2197          * Initialize errors.
2198          */
2199         error_init();
2200 
2201         /*
2202          * Note that we may have already used kernel bcopy before this
2203          * point - but if you really care about this, adb the use_hw_*
2204          * variables to 0 before rebooting.
2205          */
2206         mach_hw_copy_limit();
2207 
2208         /*
2209          * Install the "real" preemption guards before DDI services
2210          * are available.
2211          */
2212         (void) prom_set_preprom(kern_preprom);
2213         (void) prom_set_postprom(kern_postprom);
2214         CPU->cpu_m.mutex_ready = 1;
2215 
2216         /*
2217          * Initialize segnf (kernel support for non-faulting loads).
2218          */
2219         segnf_init();
2220 
2221         /*
2222          * Configure the root devinfo node.
2223          */
2224         configure();            /* set up devices */
2225         mach_cpu_halt_idle();
2226 }
2227 
2228 
2229 void
2230 post_startup(void)
2231 {
2232 #ifdef  PTL1_PANIC_DEBUG
2233         extern void init_ptl1_thread(void);
2234 #endif  /* PTL1_PANIC_DEBUG */
2235         extern void abort_sequence_init(void);
2236 
2237         /*
2238          * Set the system wide, processor-specific flags to be passed
2239          * to userland via the aux vector for performance hints and
2240          * instruction set extensions.
2241          */
2242         bind_hwcap();
2243 
2244         /*
2245          * Start up the memory scrubber (if any).
2246          */
2247         mach_memscrub();
2248 
2249         /*
2250          * Allocate soft interrupt to handle abort sequence.
2251          */
2252         abort_sequence_init();
2253 
2254         /*
2255          * Configure the rest of the system.
2256          * Perform forceloading tasks for /etc/system.
2257          */
2258         (void) mod_sysctl(SYS_FORCELOAD, NULL);
2259         /*
2260          * ON4.0: Force /proc module in until clock interrupt handle fixed
2261          * ON4.0: This must be fixed or restated in /etc/system.
2262          */
2263         (void) modload("fs", "procfs");
2264 
2265         /* load machine class specific drivers */
2266         load_mach_drivers();
2267 
2268         /* load platform specific drivers */
2269         if (&load_platform_drivers)
2270                 load_platform_drivers();
2271 
2272         /* load vis simulation module, if we are running w/fpu off */
2273         if (!fpu_exists) {
2274                 if (modload("misc", "vis") == -1)
2275                         halt("Can't load vis");
2276         }
2277 
2278         mach_fpras();
2279 
2280         maxmem = freemem;
2281 
2282         pg_init();
2283 
2284 #ifdef  PTL1_PANIC_DEBUG
2285         init_ptl1_thread();
2286 #endif  /* PTL1_PANIC_DEBUG */
2287 }
2288 
2289 #ifdef  PTL1_PANIC_DEBUG
2290 int             ptl1_panic_test = 0;
2291 int             ptl1_panic_xc_one_test = 0;
2292 int             ptl1_panic_xc_all_test = 0;
2293 int             ptl1_panic_xt_one_test = 0;
2294 int             ptl1_panic_xt_all_test = 0;
2295 kthread_id_t    ptl1_thread_p = NULL;
2296 kcondvar_t      ptl1_cv;
2297 kmutex_t        ptl1_mutex;
2298 int             ptl1_recurse_count_threshold = 0x40;
2299 int             ptl1_recurse_trap_threshold = 0x3d;
2300 extern void     ptl1_recurse(int, int);
2301 extern void     ptl1_panic_xt(int, int);
2302 
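/*
 * The ptl1_panic_*_test flags above are normally poked at run time
 * with a kernel debugger to trigger the corresponding test.  A
 * hypothetical invocation (assuming a DEBUG kernel built with
 * PTL1_PANIC_DEBUG):
 *
 *	echo 'ptl1_panic_test/W 1' | mdb -kw
 */
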
2303 /*
2304  * Called once per second by timeout() to wake up
2305  * the ptl1_panic thread to see if it should cause
2306  * a trap to the ptl1_panic() code.
2307  */
2308 /* ARGSUSED */
2309 static void
2310 ptl1_wakeup(void *arg)
2311 {
2312         mutex_enter(&ptl1_mutex);
2313         cv_signal(&ptl1_cv);
2314         mutex_exit(&ptl1_mutex);
2315 }
2316 
2317 /*
2318  * ptl1_panic cross call function:
2319  *      Needed because xc_one() and xc_some() can pass
2320  *      64-bit args but ptl1_recurse() expects ints.
2321  */
2322 static void
2323 ptl1_panic_xc(void)
2324 {
2325         ptl1_recurse(ptl1_recurse_count_threshold,
2326             ptl1_recurse_trap_threshold);
2327 }
2328 
2329 /*
2330  * The ptl1 thread waits for a global flag to be set
2331  * and uses the recurse thresholds to set the stack depth
2332  * to cause a ptl1_panic() directly via a call to ptl1_recurse
2333  * or indirectly via the cross call and cross trap functions.
2334  *
2335  * This is useful for testing stack overflows and normal
2336  * ptl1_panic() states with a known stack frame.
2337  *
2338  * ptl1_recurse() is an asm function in ptl1_panic.s that
2339  * sets the {In, Local, Out, and Global} registers to a
2340  * known state on the stack just prior to causing a
2341  * test ptl1_panic trap.
2342  */
2343 static void
2344 ptl1_thread(void)
2345 {
2346         mutex_enter(&ptl1_mutex);
2347         while (ptl1_thread_p) {
2348                 cpuset_t        other_cpus;
2349                 int             cpu_id;
2350                 int             my_cpu_id;
2351                 int             target_cpu_id;
2352                 int             target_found;
2353 
2354                 if (ptl1_panic_test) {
2355                         ptl1_recurse(ptl1_recurse_count_threshold,
2356                             ptl1_recurse_trap_threshold);
2357                 }
2358 
2359                 /*
2360                  * Find potential targets for x-call and x-trap.
2361                  * If any exist, then while preemption is disabled
2362                  * we start a ptl1_panic if requested via the
2363                  * global flags.
2364                  */
2365                 kpreempt_disable();
2366                 my_cpu_id = CPU->cpu_id;
2367                 other_cpus = cpu_ready_set;
2368                 CPUSET_DEL(other_cpus, CPU->cpu_id);
2369                 target_found = 0;
2370                 if (!CPUSET_ISNULL(other_cpus)) {
2371                         /*
2372                          * Pick the first one
2373                          */
2374                         for (cpu_id = 0; cpu_id < NCPU; cpu_id++) {
2375                                 if (cpu_id == my_cpu_id)
2376                                         continue;
2377 
2378                                 if (CPU_XCALL_READY(cpu_id)) {
2379                                         target_cpu_id = cpu_id;
2380                                         target_found = 1;
2381                                         break;
2382                                 }
2383                         }
2384                         ASSERT(target_found);
2385 
2386                         if (ptl1_panic_xc_one_test) {
2387                                 xc_one(target_cpu_id,
2388                                     (xcfunc_t *)ptl1_panic_xc, 0, 0);
2389                         }
2390                         if (ptl1_panic_xc_all_test) {
2391                                 xc_some(other_cpus,
2392                                     (xcfunc_t *)ptl1_panic_xc, 0, 0);
2393                         }
2394                         if (ptl1_panic_xt_one_test) {
2395                                 xt_one(target_cpu_id,
2396                                     (xcfunc_t *)ptl1_panic_xt, 0, 0);
2397                         }
2398                         if (ptl1_panic_xt_all_test) {
2399                                 xt_some(other_cpus,
2400                                     (xcfunc_t *)ptl1_panic_xt, 0, 0);
2401                         }
2402                 }
2403                 kpreempt_enable();
2404                 (void) timeout(ptl1_wakeup, NULL, hz);
2405                 (void) cv_wait(&ptl1_cv, &ptl1_mutex);
2406         }
2407         mutex_exit(&ptl1_mutex);
2408 }
2409 
2410 /*
2411  * Called from post_startup() to create the ptl1_thread
2412  */
2413 void
2414 init_ptl1_thread(void)
2415 {
2416         ptl1_thread_p = thread_create(NULL, 0, ptl1_thread, NULL, 0,
2417             &p0, TS_RUN, 0);
2418 }
2419 #endif  /* PTL1_PANIC_DEBUG */
2420 
2421 
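/*
 * Hand out the next entry from the pre-allocated memlist array at
 * *memlistp (sized via memlist_sz in startup_memlist()) and advance
 * the array cursor past it.
 */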
2422 static void
2423 memlist_new(uint64_t start, uint64_t len, struct memlist **memlistp)
2424 {
2425         struct memlist *new;
2426 
2427         new = *memlistp;
2428         new->ml_address = start;
2429         new->ml_size = len;
2430         *memlistp = new + 1;
2431 }
2432 
2433 /*
2434  * Add to a memory list.
2435  * start = start of new memory segment
2436  * len = length of new memory segment in bytes
2437  * memlistp = pointer to array of available memory segment structures
2438  * curmemlistp = memory list to which to add segment.
2439  */
2440 static void
2441 memlist_add(uint64_t start, uint64_t len, struct memlist **memlistp,
2442     struct memlist **curmemlistp)
2443 {
2444         struct memlist *new = *memlistp;
2445 
2446         memlist_new(start, len, memlistp);
2447         memlist_insert(new, curmemlistp);
2448 }
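
/*
 * For example, startup_memlist() builds its nopp_list with calls like
 *
 *	memlist_add(va_to_pa(s_data), MMU_PAGESIZE4M - ndata_remain_sz,
 *	    &memlist, &nopp_list);
 *
 * each of which consumes one entry from the memlist array and links
 * it, in address order, into the target list.
 */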
2449 
2450 static int
2451 ndata_alloc_memseg(struct memlist *ndata, size_t avail)
2452 {
2453         int nseg;
2454         size_t memseg_sz;
2455         struct memseg *msp;
2456 
2457         /*
2458          * The memseg list is for the chunks of physical memory that
2459          * will be managed by the vm system.  The number calculated is
2460          * a guess as boot may fragment it more when memory allocations
2461          * are made before kphysm_init().
2462          */
2463         memseg_sz = (avail + 10) * sizeof (struct memseg);
2464         memseg_sz = roundup(memseg_sz, PAGESIZE);
2465         nseg = memseg_sz / sizeof (struct memseg);
2466         msp = ndata_alloc(ndata, memseg_sz, ecache_alignsize);
2467         if (msp == NULL)
2468                 return (1);
2469         PRM_DEBUG(memseg_free);
2470 
2471         while (nseg--) {
2472                 msp->next = memseg_free;
2473                 memseg_free = msp;
2474                 msp++;
2475         }
2476         return (0);
2477 }
2478 
2479 /*
2480  * In the case of architectures that support dynamic addition of
2481  * memory at run-time there are two cases where memsegs need to
2482  * be initialized and added to the memseg list.
2483  * 1) memsegs that are constructed at startup.
2484  * 2) memsegs that are constructed at run-time on
2485  *    hot-plug capable architectures.
2486  * This code was originally part of the function kphysm_init().
2487  */
2488 
2489 static void
2490 memseg_list_add(struct memseg *memsegp)
2491 {
2492         struct memseg **prev_memsegp;
2493         pgcnt_t num;
2494 
2495         /* insert into the memseg list, in decreasing order of page count */
2496 
2497         num = MSEG_NPAGES(memsegp);
2498 
2499         for (prev_memsegp = &memsegs; *prev_memsegp;
2500             prev_memsegp = &((*prev_memsegp)->next)) {
2501                 if (num > MSEG_NPAGES(*prev_memsegp))
2502                         break;
2503         }
2504 
2505         memsegp->next = *prev_memsegp;
2506         *prev_memsegp = memsegp;
2507 
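        /*
         * kpm also needs to walk the memseg list using physical
         * addresses (e.g. from trap-level handlers that cannot rely
         * on kernel virtual translations), so mirror the update in
         * the nextpa/memsegspa physical-address chain.
         */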
2508         if (kpm_enable) {
2509                 memsegp->nextpa = (memsegp->next) ?
2510                     va_to_pa(memsegp->next) : MSEG_NULLPTR_PA;
2511 
2512                 if (prev_memsegp != &memsegs) {
2513                         struct memseg *msp;
2514                         msp = (struct memseg *)((caddr_t)prev_memsegp -
2515                             offsetof(struct memseg, next));
2516                         msp->nextpa = va_to_pa(memsegp);
2517                 } else {
2518                         memsegspa = va_to_pa(memsegs);
2519                 }
2520         }
2521 }
2522 
2523 /*
2524  * PSM add_physmem_cb(). US-II and newer processors have some
2525  * flavor of the prefetch capability implemented. We exploit
2526  * this capability for optimum performance.
2527  */
2528 #define PREFETCH_BYTES  64
2529 
2530 void
2531 add_physmem_cb(page_t *pp, pfn_t pnum)
2532 {
2533         extern void      prefetch_page_w(void *);
2534 
2535         pp->p_pagenum = pnum;
2536 
2537         /*
2538          * Prefetch one more page_t into E$. To prevent future
2539          * mishaps with the sizeof(page_t) changing on us, we
2540          * catch this on debug kernels if we can't bring in the
2541          * entire hpage with 2 PREFETCH_BYTES reads. See
2542          * also, sun4u/cpu/cpu_module.c
2543          */
2544         /*LINTED*/
2545         ASSERT(sizeof (page_t) <= 2*PREFETCH_BYTES);
2546         prefetch_page_w((char *)pp);
2547 }
2548 
2549 /*
2550  * Find memseg with given pfn
2551  */
2552 static struct memseg *
2553 memseg_find(pfn_t base, pfn_t *next)
2554 {
2555         struct memseg *seg;
2556 
2557         if (next != NULL)
2558                 *next = LONG_MAX;
2559         for (seg = memsegs; seg != NULL; seg = seg->next) {
2560                 if (base >= seg->pages_base && base < seg->pages_end)
2561                         return (seg);
2562                 if (next != NULL && seg->pages_base > base &&
2563                     seg->pages_base < *next)
2564                         *next = seg->pages_base;
2565         }
2566         return (NULL);
2567 }
2568 
2569 /*
2570  * Put pages allocated by OBP on prom_ppages
2571  */
2572 static void
2573 kphysm_erase(uint64_t addr, uint64_t len)
2574 {
2575         struct page *pp;
2576         struct memseg *seg;
2577         pfn_t base = btop(addr), next;
2578         pgcnt_t num = btop(len);
2579 
2580         while (num != 0) {
2581                 pgcnt_t off, left;
2582 
2583                 seg = memseg_find(base, &next);
2584                 if (seg == NULL) {
2585                         if (next == LONG_MAX)
2586                                 break;
2587                         left = MIN(next - base, num);
2588                         base += left, num -= left;
2589                         continue;
2590                 }
2591                 off = base - seg->pages_base;
2592                 pp = seg->pages + off;
2593                 left = num - MIN(num, (seg->pages_end - seg->pages_base) - off);
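                /*
                 * 'left' is the portion of the request that spills
                 * past the end of this memseg; the loop below covers
                 * only the pages that fall within this memseg.
                 */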
2594                 while (num != left) {
2595                         /*
2596                          * init it, lock it, and hashin on prom_pages vp.
2597                          *
2598                          * Mark it as NONRELOC to let DR know the page
2599                          * is locked long term, otherwise DR hangs when
2600                          * trying to remove those pages.
2601                          *
2602                          * XXX  vnode offsets on the prom_ppages vnode
2603                          *      are page numbers (gack) for >32 bit
2604                          *      physical memory machines.
2605                          */
2606                         PP_SETNORELOC(pp);
2607                         add_physmem_cb(pp, base);
2608                         if (page_trylock(pp, SE_EXCL) == 0)
2609                                 cmn_err(CE_PANIC, "prom page locked");
2610                         (void) page_hashin(pp, &promvp,
2611                             (offset_t)base, NULL);
2612                         (void) page_pp_lock(pp, 0, 1);
2613                         pp++, base++, num--;
2614                 }
2615         }
2616 }
2617 
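/*
 * Cursors into the page array (pp_base) and the kpm page array,
 * set up by kphysm_init() and consumed one memseg at a time by
 * kphysm_memseg().
 */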
2618 static page_t *ppnext;
2619 static pgcnt_t ppleft;
2620 
2621 static void *kpm_ppnext;
2622 static pgcnt_t kpm_ppleft;
2623 
2624 /*
2625  * Create a memseg
2626  */
2627 static void
2628 kphysm_memseg(uint64_t addr, uint64_t len)
2629 {
2630         pfn_t base = btop(addr);
2631         pgcnt_t num = btop(len);
2632         struct memseg *seg;
2633 
2634         seg = memseg_free;
2635         ASSERT(seg != NULL);
2636         memseg_free = seg->next;
2637 
2638         seg->pages = ppnext;
2639         seg->epages = ppnext + num;
2640         seg->pages_base = base;
2641         seg->pages_end = base + num;
2642         ppnext += num;
2643         ppleft -= num;
2644 
2645         if (kpm_enable) {
2646                 pgcnt_t kpnum = ptokpmpr(num);
2647 
2648                 if (kpnum > kpm_ppleft)
2649                         panic("kphysm_memseg: kpm_pp overflow");
2650                 seg->pagespa = va_to_pa(seg->pages);
2651                 seg->epagespa = va_to_pa(seg->epages);
2652                 seg->kpm_pbase = kpmptop(ptokpmp(base));
2653                 seg->kpm_nkpmpgs = kpnum;
2654                 /*
2655                  * In the kpm_smallpage case, the kpm array
2656                  * is 1-1 wrt the page array
2657                  */
2658                 if (kpm_smallpages) {
2659                         kpm_spage_t *kpm_pp = kpm_ppnext;
2660 
2661                         kpm_ppnext = kpm_pp + kpnum;
2662                         seg->kpm_spages = kpm_pp;
2663                         seg->kpm_pagespa = va_to_pa(seg->kpm_spages);
2664                 } else {
2665                         kpm_page_t *kpm_pp = kpm_ppnext;
2666 
2667                         kpm_ppnext = kpm_pp + kpnum;
2668                         seg->kpm_pages = kpm_pp;
2669                         seg->kpm_pagespa = va_to_pa(seg->kpm_pages);
2670                         /* ASSERT no kpm overlaps */
2671                         ASSERT(
2672                             memseg_find(base - pmodkpmp(base), NULL) == NULL);
2673                         ASSERT(memseg_find(
2674                             roundup(base + num, kpmpnpgs) - 1, NULL) == NULL);
2675                 }
2676                 kpm_ppleft -= kpnum;
2677         }
2678 
2679         memseg_list_add(seg);
2680 }
2681 
2682 /*
2683  * Add range to free list
2684  */
2685 void
2686 kphysm_add(uint64_t addr, uint64_t len, int reclaim)
2687 {
2688         struct page *pp;
2689         struct memseg *seg;
2690         pfn_t base = btop(addr);
2691         pgcnt_t num = btop(len);
2692 
2693         seg = memseg_find(base, NULL);
2694         ASSERT(seg != NULL);
2695         pp = seg->pages + (base - seg->pages_base);
2696 
2697         if (reclaim) {
2698                 struct page *rpp = pp;
2699                 struct page *lpp = pp + num;
2700 
2701                 /*
2702                  * pages should be locked on prom_ppages;
2703                  * unhash and unlock them
2704                  */
2705                 while (rpp < lpp) {
2706                         ASSERT(PAGE_EXCL(rpp) && rpp->p_vnode == &promvp);
2707                         ASSERT(PP_ISNORELOC(rpp));
2708                         PP_CLRNORELOC(rpp);
2709                         page_pp_unlock(rpp, 0, 1);
2710                         page_hashout(rpp, NULL);
2711                         page_unlock(rpp);
2712                         rpp++;
2713                 }
2714         }
2715 
2716         /*
2717          * add_physmem() initializes the PSM part of the page
2718          * struct by calling the PSM back with add_physmem_cb().
2719          * In addition it coalesces pages into larger pages as
2720          * it initializes them.
2721          */
2722         add_physmem(pp, num, base);
2723 }
2724 
2725 /*
2726  * kphysm_init() tackles the problem of initializing physical memory.
2727  */
2728 static void
2729 kphysm_init(void)
2730 {
2731         struct memlist *pmem;
2732 
2733         ASSERT(page_hash != NULL && page_hashsz != 0);
2734 
2735         ppnext = pp_base;
2736         ppleft = npages;
2737         kpm_ppnext = kpm_pp_base;
2738         kpm_ppleft = kpm_npages;
2739 
2740         /*
2741          * installed pages not on nopp_memlist go in memseg list
2742          */
2743         diff_memlists(phys_install, nopp_list, kphysm_memseg);
2744 
2745         /*
2746          * Free the avail list
2747          */
2748         for (pmem = phys_avail; pmem != NULL; pmem = pmem->ml_next)
2749                 kphysm_add(pmem->ml_address, pmem->ml_size, 0);
2750 
2751         /*
2752          * Erase pages that aren't available
2753          */
2754         diff_memlists(phys_install, phys_avail, kphysm_erase);
2755 
2756         build_pfn_hash();
2757 }
2758 
2759 /*
2760  * Kernel VM initialization.
2761  * Assumptions about kernel address space ordering:
2762  *      (1) gap (user space)
2763  *      (2) kernel text
2764  *      (3) kernel data/bss
2765  *      (4) gap
2766  *      (5) kernel data structures
2767  *      (6) gap
2768  *      (7) debugger (optional)
2769  *      (8) monitor
2770  *      (9) gap (possibly null)
2771  *      (10) dvma
2772  *      (11) devices
2773  */
2774 static void
2775 kvm_init(void)
2776 {
2777         /*
2778          * Put the kernel segments in kernel address space.
2779          */
2780         rw_enter(&kas.a_lock, RW_WRITER);
2781         as_avlinit(&kas);
2782 
2783         (void) seg_attach(&kas, (caddr_t)KERNELBASE,
2784             (size_t)(e_moddata - KERNELBASE), &ktextseg);
2785         (void) segkmem_create(&ktextseg);
2786 
2787         (void) seg_attach(&kas, (caddr_t)(KERNELBASE + MMU_PAGESIZE4M),
2788             (size_t)(MMU_PAGESIZE4M), &ktexthole);
2789         (void) segkmem_create(&ktexthole);
2790 
2791         (void) seg_attach(&kas, (caddr_t)valloc_base,
2792             (size_t)(econtig32 - valloc_base), &kvalloc);
2793         (void) segkmem_create(&kvalloc);
2794 
2795         if (kmem64_base) {
2796                 (void) seg_attach(&kas, (caddr_t)kmem64_base,
2797                     (size_t)(kmem64_end - kmem64_base), &kmem64);
2798                 (void) segkmem_create(&kmem64);
2799         }
2800 
2801         /*
2802          * We're about to map out /boot.  This is the beginning of the
2803          * system resource management transition. We can no longer
2804          * call into /boot for I/O or memory allocations.
2805          */
2806         (void) seg_attach(&kas, kernelheap, ekernelheap - kernelheap, &kvseg);
2807         (void) segkmem_create(&kvseg);
2808         hblk_alloc_dynamic = 1;
2809 
2810         /*
         * We must preallocate pages for DR operations before enabling the
         * large-page kernel heap, because of the memseg_remap_init()
         * hat_unload() hack.
2813          */
2814         memseg_remap_init();
2815 
        /* At this point we are ready to use the large-page kernel heap. */
2817         segkmem_heap_lp_init();
2818 
2819         (void) seg_attach(&kas, (caddr_t)SYSBASE32, SYSLIMIT32 - SYSBASE32,
2820             &kvseg32);
2821         (void) segkmem_create(&kvseg32);
2822 
2823         /*
2824          * Create a segment for the debugger.
2825          */
2826         (void) seg_attach(&kas, kdi_segdebugbase, kdi_segdebugsize, &kdebugseg);
2827         (void) segkmem_create(&kdebugseg);
2828 
2829         rw_exit(&kas.a_lock);
2830 }
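
/*
 * Each segment above is attached with the same idiom, shown here as a
 * minimal sketch (base, size and seg stand in for one segment's
 * parameters; the kas writer lock must be held across the attach):
 *
 *	rw_enter(&kas.a_lock, RW_WRITER);
 *	(void) seg_attach(&kas, base, size, &seg);
 *	(void) segkmem_create(&seg);
 *	rw_exit(&kas.a_lock);
 */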
2831 
2832 char obp_tte_str[] =
2833         "h# %x constant MMU_PAGESHIFT "
2834         "h# %x constant TTE8K "
2835         "h# %x constant SFHME_SIZE "
2836         "h# %x constant SFHME_TTE "
2837         "h# %x constant HMEBLK_TAG "
2838         "h# %x constant HMEBLK_NEXT "
2839         "h# %x constant HMEBLK_MISC "
2840         "h# %x constant HMEBLK_HME1 "
2841         "h# %x constant NHMENTS "
2842         "h# %x constant HBLK_SZMASK "
2843         "h# %x constant HBLK_RANGE_SHIFT "
2844         "h# %x constant HMEBP_HBLK "
2845         "h# %x constant HMEBLK_ENDPA "
2846         "h# %x constant HMEBUCKET_SIZE "
2847         "h# %x constant HTAG_SFMMUPSZ "
2848         "h# %x constant HTAG_BSPAGE_SHIFT "
2849         "h# %x constant HTAG_REHASH_SHIFT "
2850         "h# %x constant SFMMU_INVALID_SHMERID "
2851         "h# %x constant mmu_hashcnt "
2852         "h# %p constant uhme_hash "
2853         "h# %p constant khme_hash "
2854         "h# %x constant UHMEHASH_SZ "
2855         "h# %x constant KHMEHASH_SZ "
2856         "h# %p constant KCONTEXT "
2857         "h# %p constant KHATID "
2858         "h# %x constant ASI_MEM "
2859 
2860         ": PHYS-X@ ( phys -- data ) "
2861         "   ASI_MEM spacex@ "
2862         "; "
2863 
2864         ": PHYS-W@ ( phys -- data ) "
2865         "   ASI_MEM spacew@ "
2866         "; "
2867 
2868         ": PHYS-L@ ( phys -- data ) "
        "   ASI_MEM spacel@ "
2870         "; "
2871 
2872         ": TTE_PAGE_SHIFT ( ttesz -- hmeshift ) "
2873         "   3 * MMU_PAGESHIFT + "
2874         "; "
2875 
2876         ": TTE_IS_VALID ( ttep -- flag ) "
2877         "   PHYS-X@ 0< "
2878         "; "
2879 
2880         ": HME_HASH_SHIFT ( ttesz -- hmeshift ) "
2881         "   dup TTE8K =  if "
2882         "      drop HBLK_RANGE_SHIFT "
2883         "   else "
2884         "      TTE_PAGE_SHIFT "
2885         "   then "
2886         "; "
2887 
2888         ": HME_HASH_BSPAGE ( addr hmeshift -- bspage ) "
2889         "   tuck >> swap MMU_PAGESHIFT - << "
2890         "; "
2891 
2892         ": HME_HASH_FUNCTION ( sfmmup addr hmeshift -- hmebp ) "
2893         "   >> over xor swap                    ( hash sfmmup ) "
2894         "   KHATID <>  if                       ( hash ) "
2895         "      UHMEHASH_SZ and                  ( bucket ) "
2896         "      HMEBUCKET_SIZE * uhme_hash +     ( hmebp ) "
2897         "   else                                ( hash ) "
2898         "      KHMEHASH_SZ and                  ( bucket ) "
2899         "      HMEBUCKET_SIZE * khme_hash +     ( hmebp ) "
2900         "   then                                ( hmebp ) "
2901         "; "
2902 
2903         ": HME_HASH_TABLE_SEARCH "
2904         "       ( sfmmup hmebp hblktag --  sfmmup null | sfmmup hmeblkp ) "
2905         "   >r hmebp_hblk + phys-x@ begin ( sfmmup hmeblkp ) ( r: hblktag ) "
2906         "      dup HMEBLK_ENDPA <> if     ( sfmmup hmeblkp ) ( r: hblktag ) "
2907         "         dup hmeblk_tag + phys-x@ r@ = if ( sfmmup hmeblkp )     "
2908         "            dup hmeblk_tag + 8 + phys-x@ 2 pick = if             "
2909         "                 true  ( sfmmup hmeblkp true ) ( r: hblktag )    "
2910         "            else                                                 "
2911         "                 hmeblk_next + phys-x@ false                     "
2912         "                       ( sfmmup hmeblkp false ) ( r: hblktag )   "
2913         "            then                                                 "
2914         "         else                                                    "
2915         "            hmeblk_next + phys-x@ false                          "
2916         "                       ( sfmmup hmeblkp false ) ( r: hblktag )   "
2917         "         then                                                    "
2918         "      else                                                       "
2919         "         drop 0 true                                             "
2920         "      then                                                       "
2921         "   until r> drop                                              "
2922         "; "
2923 
2924         ": HME_HASH_TAG ( sfmmup rehash addr -- hblktag ) "
2925         "   over HME_HASH_SHIFT HME_HASH_BSPAGE  ( sfmmup rehash bspage ) "
2926         "   HTAG_BSPAGE_SHIFT <<           ( sfmmup rehash htag-bspage )"
2927         "   swap HTAG_REHASH_SHIFT << or   ( sfmmup htag-bspage-rehash )"
2928         "   SFMMU_INVALID_SHMERID or nip         ( hblktag ) "
2929         "; "
2930 
2931         ": HBLK_TO_TTEP ( hmeblkp addr -- ttep ) "
2932         "   over HMEBLK_MISC + PHYS-L@ HBLK_SZMASK and  ( hmeblkp addr ttesz ) "
2933         "   TTE8K =  if                            ( hmeblkp addr ) "
2934         "      MMU_PAGESHIFT >> NHMENTS 1- and     ( hmeblkp hme-index ) "
2935         "   else                                   ( hmeblkp addr ) "
2936         "      drop 0                              ( hmeblkp 0 ) "
2937         "   then                                   ( hmeblkp hme-index ) "
2938         "   SFHME_SIZE * + HMEBLK_HME1 +           ( hmep ) "
2939         "   SFHME_TTE +                            ( ttep ) "
2940         "; "
2941 
2942         ": unix-tte ( addr cnum -- false | tte-data true ) "
2943         "    KCONTEXT = if                   ( addr ) "
2944         "       KHATID                       ( addr khatid ) "
2945         "    else                            ( addr ) "
2946         "       drop false exit              ( false ) "
2947         "    then "
2948         "      ( addr khatid ) "
2949         "      mmu_hashcnt 1+ 1  do           ( addr sfmmup ) "
2950         "         2dup swap i HME_HASH_SHIFT  "
2951                                         "( addr sfmmup sfmmup addr hmeshift ) "
2952         "         HME_HASH_FUNCTION           ( addr sfmmup hmebp ) "
2953         "         over i 4 pick               "
2954                                 "( addr sfmmup hmebp sfmmup rehash addr ) "
2955         "         HME_HASH_TAG                ( addr sfmmup hmebp hblktag ) "
2956         "         HME_HASH_TABLE_SEARCH       "
2957                                         "( addr sfmmup { null | hmeblkp } ) "
2958         "         ?dup  if                    ( addr sfmmup hmeblkp ) "
2959         "            nip swap HBLK_TO_TTEP    ( ttep ) "
2960         "            dup TTE_IS_VALID  if     ( valid-ttep ) "
2961         "               PHYS-X@ true          ( tte-data true ) "
2962         "            else                     ( invalid-tte ) "
2963         "               drop false            ( false ) "
2964         "            then                     ( false | tte-data true ) "
2965         "            unloop exit              ( false | tte-data true ) "
2966         "         then                        ( addr sfmmup ) "
2967         "      loop                           ( addr sfmmup ) "
2968         "      2drop false                    ( false ) "
2969         "; "
2970 ;
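
/*
 * For reference, the unix-tte word above implements roughly the following
 * lookup, rendered as a hedged C-style sketch.  The helpers hash_func(),
 * hash_tag(), hblk_search() and hblk_to_ttep() are hypothetical stand-ins
 * for the HME_HASH_FUNCTION, HME_HASH_TAG, HME_HASH_TABLE_SEARCH and
 * HBLK_TO_TTEP words; only valid kernel-context translations succeed:
 *
 *	if (cnum != KCONTEXT)
 *		return (false);			( non-kernel: no lookup )
 *	for (rehash = 1; rehash <= mmu_hashcnt; rehash++) {
 *		hmebp = hash_func(KHATID, addr, HME_HASH_SHIFT(rehash));
 *		tag = hash_tag(KHATID, rehash, addr);
 *		hmeblkp = hblk_search(KHATID, hmebp, tag);
 *		if (hmeblkp == NULL)
 *			continue;		( try the next hash size )
 *		ttep = hblk_to_ttep(hmeblkp, addr);
 *		if (TTE_IS_VALID(ttep))
 *			return (tte-data, true);
 *		return (false);			( invalid TTE )
 *	}
 *	return (false);				( no mapping found )
 */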
2971 
2972 void
2973 create_va_to_tte(void)
2974 {
2975         char *bp;
2976         extern int khmehash_num, uhmehash_num;
2977         extern struct hmehash_bucket *khme_hash, *uhme_hash;
2978 
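/*
 * Byte offset of 'field' within 'type'; equivalent to the ISO C offsetof().
 */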
2979 #define OFFSET(type, field)     ((uintptr_t)(&((type *)0)->field))
2980 
2981         bp = (char *)kobj_zalloc(MMU_PAGESIZE, KM_SLEEP);
2982 
2983         /*
         * Teach OBP how to parse our software TTEs.
2985          */
2986         (void) sprintf(bp, obp_tte_str,
2987             MMU_PAGESHIFT,
2988             TTE8K,
2989             sizeof (struct sf_hment),
2990             OFFSET(struct sf_hment, hme_tte),
2991             OFFSET(struct hme_blk, hblk_tag),
2992             OFFSET(struct hme_blk, hblk_nextpa),
2993             OFFSET(struct hme_blk, hblk_misc),
2994             OFFSET(struct hme_blk, hblk_hme),
2995             NHMENTS,
2996             HBLK_SZMASK,
2997             HBLK_RANGE_SHIFT,
2998             OFFSET(struct hmehash_bucket, hmeh_nextpa),
2999             HMEBLK_ENDPA,
3000             sizeof (struct hmehash_bucket),
3001             HTAG_SFMMUPSZ,
3002             HTAG_BSPAGE_SHIFT,
3003             HTAG_REHASH_SHIFT,
3004             SFMMU_INVALID_SHMERID,
3005             mmu_hashcnt,
3006             (caddr_t)va_to_pa((caddr_t)uhme_hash),
3007             (caddr_t)va_to_pa((caddr_t)khme_hash),
3008             UHMEHASH_SZ,
3009             KHMEHASH_SZ,
3010             KCONTEXT,
3011             KHATID,
3012             ASI_MEM);
3013         prom_interpret(bp, 0, 0, 0, 0, 0);
3014 
3015         kobj_free(bp, MMU_PAGESIZE);
3016 }
3017 
3018 void
3019 install_va_to_tte(void)
3020 {
3021         /*
         * Advise the PROM that it can use unix-tte.
3023          */
3024         prom_interpret("' unix-tte is va>tte-data", 0, 0, 0, 0, 0);
3025 }
3026 
3027 /*
 * Here we add "device-type=console" to the /os-io node, because our
 * kernel console output currently supports only displaying text and
 * performing cursor-positioning operations (through the kernel
 * framebuffer driver); it does not support the other functionality
 * required of a standard "display" device as specified in the 1275
 * spec, chiefly the "draw-logo" interface.
 * Also see the comments above prom_stdout_is_framebuffer().
3035  */
3036 static char *create_node =
3037         "\" /\" find-device "
3038         "new-device "
3039         "\" os-io\" device-name "
3040         "\" "OBP_DISPLAY_CONSOLE"\" device-type "
3041         ": cb-r/w  ( adr,len method$ -- #read/#written ) "
3042         "   2>r swap 2 2r> ['] $callback  catch  if "
3043         "      2drop 3drop 0 "
3044         "   then "
3045         "; "
3046         ": read ( adr,len -- #read ) "
3047         "       \" read\" ['] cb-r/w catch  if  2drop 2drop -2 exit then "
3048         "       ( retN ... ret1 N ) "
3049         "       ?dup  if "
3050         "               swap >r 1-  0  ?do  drop  loop  r> "
3051         "       else "
3052         "               -2 "
3053         "       then "
3054         ";    "
3055         ": write ( adr,len -- #written ) "
3056         "       \" write\" ['] cb-r/w catch  if  2drop 2drop 0 exit  then "
3057         "       ( retN ... ret1 N ) "
3058         "       ?dup  if "
3059         "               swap >r 1-  0  ?do  drop  loop  r> "
3060         "        else "
3061         "               0 "
3062         "       then "
3063         "; "
3064         ": poll-tty ( -- ) ; "
3065         ": install-abort  ( -- )  ['] poll-tty d# 10 alarm ; "
3066         ": remove-abort ( -- )  ['] poll-tty 0 alarm ; "
3067         ": cb-give/take ( $method -- ) "
3068         "       0 -rot ['] $callback catch  ?dup  if "
3069         "               >r 2drop 2drop r> throw "
3070         "       else "
3071         "               0  ?do  drop  loop "
3072         "       then "
3073         "; "
3074         ": give ( -- )  \" exit-input\" cb-give/take ; "
3075         ": take ( -- )  \" enter-input\" cb-give/take ; "
3076         ": open ( -- ok? )  true ; "
3077         ": close ( -- ) ; "
3078         "finish-device "
3079         "device-end ";
3080 
3081 /*
3082  * Create the OBP input/output node (FCode serial driver).
 * It is needed both for the USB console keyboard and for the kernel
 * terminal emulator.  It is too early to check for a framebuffer
 * compatible with the kernel console now, so we create this node
 * unconditionally so that we're ready if we later need to enable
 * kernel terminal emulation.
3087  *
3088  * When the USB software takes over the input device at the time
3089  * consconfig runs, OBP's stdin is redirected to this node.
3090  * Whenever the FORTH user interface is used after this switch,
3091  * the node will call back into the kernel for console input.
3092  * If a serial device such as ttya or a UART with a Type 5 keyboard
3093  * attached is used, OBP takes over the serial device when the system
3094  * goes to the debugger after the system is booted.  This sharing
3095  * of the relatively simple serial device is difficult but possible.
 * Sharing the USB host controller is impossible due to its complexity.
3097  *
3098  * Similarly to USB keyboard input redirection, after consconfig_dacf
3099  * configures a kernel console framebuffer as the standard output
 * device, OBP's stdout is switched to vector through the
3101  * /os-io node into the kernel terminal emulator.
3102  */
3103 static void
3104 startup_create_io_node(void)
3105 {
3106         prom_interpret(create_node, 0, 0, 0, 0, 0);
3107 }
3108 
3109 
3110 static void
3111 do_prom_version_check(void)
3112 {
3113         int i;
3114         pnode_t node;
3115         char buf[64];
3116         static char drev[] = "Down-rev firmware detected%s\n"
3117             "\tPlease upgrade to the following minimum version:\n"
3118             "\t\t%s\n";
3119 
3120         i = prom_version_check(buf, sizeof (buf), &node);
3121 
3122         if (i == PROM_VER64_OK)
3123                 return;
3124 
3125         if (i == PROM_VER64_UPGRADE) {
3126                 cmn_err(CE_WARN, drev, "", buf);
3127 
3128 #ifdef  DEBUG
3129                 prom_enter_mon();       /* Type 'go' to continue */
3130                 cmn_err(CE_WARN, "Booting with down-rev firmware\n");
3131                 return;
3132 #else
3133                 halt(0);
3134 #endif
3135         }
3136 
3137         /*
         * The other possibility is that this is a server running
         * good firmware, but down-rev firmware was detected on at
         * least one other CPU board.  In that case we just complain.
3142          */
3143         cmn_err(CE_WARN, drev, " on one or more CPU boards", buf);
3144 }
3145 
3146 
3147 /*
 * These must be defined in platform-dependent code.
3149  */
3150 extern caddr_t modtext;
3151 extern size_t modtext_sz;
3152 extern caddr_t moddata;
3153 
3154 #define HEAPTEXT_ARENA(addr)    \
3155         ((uintptr_t)(addr) < KERNELBASE + 2 * MMU_PAGESIZE4M ? 0 : \
3156         (((uintptr_t)(addr) - HEAPTEXT_BASE) / \
3157         (HEAPTEXT_MAPPED + HEAPTEXT_UNMAPPED) + 1))
3158 
3159 #define HEAPTEXT_OVERSIZED(addr)        \
3160         ((uintptr_t)(addr) >= HEAPTEXT_BASE + HEAPTEXT_SIZE - HEAPTEXT_OVERSIZE)
3161 
3162 #define HEAPTEXT_IN_NUCLEUSDATA(addr) \
3163         (((uintptr_t)(addr) >= KERNELBASE + 2 * MMU_PAGESIZE4M) && \
3164         ((uintptr_t)(addr) < KERNELBASE + 3 * MMU_PAGESIZE4M))
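
/*
 * A worked example with hypothetical values (the real HEAPTEXT_* constants
 * come from the platform headers): if HEAPTEXT_MAPPED and HEAPTEXT_UNMAPPED
 * were each 4MB, every arena past the first would cover an 8MB stride of
 * the text heap, so an address 9MB past HEAPTEXT_BASE would land in arena
 * (9MB / 8MB) + 1 = 2, while any address in the two 4MB nucleus text pages
 * above KERNELBASE lands in arena 0.
 */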
3165 
3166 vmem_t *texthole_source[HEAPTEXT_NARENAS];
3167 vmem_t *texthole_arena[HEAPTEXT_NARENAS];
3168 kmutex_t texthole_lock;
3169 
3170 char kern_bootargs[OBP_MAXPATHLEN];
3171 char kern_bootfile[OBP_MAXPATHLEN];
3172 
3173 void
3174 kobj_vmem_init(vmem_t **text_arena, vmem_t **data_arena)
3175 {
3176         uintptr_t addr, limit;
3177 
3178         addr = HEAPTEXT_BASE;
3179         limit = addr + HEAPTEXT_SIZE - HEAPTEXT_OVERSIZE;
3180 
3181         /*
3182          * Before we initialize the text_arena, we want to punch holes in the
3183          * underlying heaptext_arena.  This guarantees that for any text
3184          * address we can find a text hole less than HEAPTEXT_MAPPED away.
3185          */
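        /*
         * In the vmem_xalloc() calls below, the arguments following the
         * arena and size are align, phase, nocross, minaddr, maxaddr and
         * vmflag; the minaddr/maxaddr pair pins each hole to one exact
         * address range.
         */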
3186         for (; addr + HEAPTEXT_UNMAPPED <= limit;
3187             addr += HEAPTEXT_MAPPED + HEAPTEXT_UNMAPPED) {
3188                 (void) vmem_xalloc(heaptext_arena, HEAPTEXT_UNMAPPED, PAGESIZE,
3189                     0, 0, (void *)addr, (void *)(addr + HEAPTEXT_UNMAPPED),
3190                     VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
3191         }
3192 
3193         /*
         * Allocate one page at the oversize boundary to separate the text
         * region from the oversized region.
3196          */
3197         (void) vmem_xalloc(heaptext_arena, PAGESIZE, PAGESIZE, 0, 0,
3198             (void *)limit, (void *)(limit + PAGESIZE),
3199             VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
3200 
3201         *text_arena = vmem_create("module_text", modtext_sz ? modtext : NULL,
3202             modtext_sz, sizeof (uintptr_t), segkmem_alloc, segkmem_free,
3203             heaptext_arena, 0, VM_SLEEP);
3204         *data_arena = vmem_create("module_data", moddata, MODDATA, 1,
3205             segkmem_alloc, segkmem_free, heap32_arena, 0, VM_SLEEP);
3206 }
3207 
3208 caddr_t
3209 kobj_text_alloc(vmem_t *arena, size_t size)
3210 {
3211         caddr_t rval, better;
3212 
3213         /*
3214          * First, try a sleeping allocation.
3215          */
3216         rval = vmem_alloc(arena, size, VM_SLEEP | VM_BESTFIT);
3217 
3218         if (size >= HEAPTEXT_MAPPED || !HEAPTEXT_OVERSIZED(rval))
3219                 return (rval);
3220 
3221         /*
3222          * We didn't get the area that we wanted.  We're going to try to do an
3223          * allocation with explicit constraints.
3224          */
3225         better = vmem_xalloc(arena, size, sizeof (uintptr_t), 0, 0, NULL,
3226             (void *)(HEAPTEXT_BASE + HEAPTEXT_SIZE - HEAPTEXT_OVERSIZE),
3227             VM_NOSLEEP | VM_BESTFIT);
3228 
3229         if (better != NULL) {
3230                 /*
3231                  * That worked.  Free our first attempt and return.
3232                  */
3233                 vmem_free(arena, rval, size);
3234                 return (better);
3235         }
3236 
3237         /*
3238          * That didn't work; we'll have to return our first attempt.
3239          */
3240         return (rval);
3241 }
3242 
3243 caddr_t
3244 kobj_texthole_alloc(caddr_t addr, size_t size)
3245 {
3246         int arena = HEAPTEXT_ARENA(addr);
3247         char c[30];
3248         uintptr_t base;
3249 
3250         if (HEAPTEXT_OVERSIZED(addr) || HEAPTEXT_IN_NUCLEUSDATA(addr)) {
3251                 /*
3252                  * If this is an oversized allocation or it is allocated in
3253                  * the nucleus data page, there is no text hole available for
3254                  * it; return NULL.
3255                  */
3256                 return (NULL);
3257         }
3258 
        ASSERT(arena >= 0 && arena < HEAPTEXT_NARENAS);

        mutex_enter(&texthole_lock);
3260 
3261         if (texthole_arena[arena] == NULL) {
3262                 ASSERT(texthole_source[arena] == NULL);
3263 
3264                 if (arena == 0) {
3265                         texthole_source[0] = vmem_create("module_text_holesrc",
3266                             (void *)(KERNELBASE + MMU_PAGESIZE4M),
3267                             MMU_PAGESIZE4M, PAGESIZE, NULL, NULL, NULL,
3268                             0, VM_SLEEP);
3269                 } else {
3270                         base = HEAPTEXT_BASE +
3271                             (arena - 1) * (HEAPTEXT_MAPPED + HEAPTEXT_UNMAPPED);
3272 
3273                         (void) snprintf(c, sizeof (c),
3274                             "heaptext_holesrc_%d", arena);
3275 
3276                         texthole_source[arena] = vmem_create(c, (void *)base,
3277                             HEAPTEXT_UNMAPPED, PAGESIZE, NULL, NULL, NULL,
3278                             0, VM_SLEEP);
3279                 }
3280 
3281                 (void) snprintf(c, sizeof (c), "heaptext_hole_%d", arena);
3282 
3283                 texthole_arena[arena] = vmem_create(c, NULL, 0,
3284                     sizeof (uint32_t), segkmem_alloc_permanent, segkmem_free,
3285                     texthole_source[arena], 0, VM_SLEEP);
3286         }
3287 
3288         mutex_exit(&texthole_lock);
3289 
3290         ASSERT(texthole_arena[arena] != NULL);
3292         return (vmem_alloc(texthole_arena[arena], size,
3293             VM_BESTFIT | VM_NOSLEEP));
3294 }
3295 
3296 void
3297 kobj_texthole_free(caddr_t addr, size_t size)
3298 {
3299         int arena = HEAPTEXT_ARENA(addr);
3300 
3301         ASSERT(arena >= 0 && arena < HEAPTEXT_NARENAS);
3302         ASSERT(texthole_arena[arena] != NULL);
3303         vmem_free(texthole_arena[arena], addr, size);
3304 }
3305 
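/*
 * release_bootstrap() below calls cif_init() only if a definition is
 * actually linked in: with a weak declaration, an unsatisfied symbol's
 * address is null, so taking its address tests for the function's
 * presence.  The same pattern, as a minimal sketch with a hypothetical
 * optional_hook():
 *
 *	extern void optional_hook(void);
 *	#pragma weak optional_hook
 *
 *	if (&optional_hook)
 *		optional_hook();
 */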
3306 void
3307 release_bootstrap(void)
3308 {
3309         if (&cif_init)
3310                 cif_init();
3311 }