/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2011 Joyent, Inc. All rights reserved.
 */

/*
 * Welcome to the world of the "real mode platter".
 * See also startup.c, mpcore.s and apic.c for related routines.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/cpu_module.h>
#include <sys/kmem.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/controlregs.h>
#include <sys/x86_archext.h>
#include <sys/smp_impldefs.h>
#include <sys/sysmacros.h>
#include <sys/mach_mmu.h>
#include <sys/promif.h>
#include <sys/cpu.h>
#include <sys/cpu_event.h>
#include <sys/sunndi.h>
#include <sys/fs/dv_node.h>
#include <vm/hat_i86.h>
#include <vm/as.h>

extern cpuset_t cpu_ready_set;

extern int mp_start_cpu_common(cpu_t *cp, boolean_t boot);
extern void real_mode_start_cpu(void);
extern void real_mode_start_cpu_end(void);
extern void real_mode_stop_cpu_stage1(void);
extern void real_mode_stop_cpu_stage1_end(void);
extern void real_mode_stop_cpu_stage2(void);
extern void real_mode_stop_cpu_stage2_end(void);

void rmp_gdt_init(rm_platter_t *);

/*
 * Fill up the real mode platter to make it easy for real mode code to
 * kick it off. This area should really be one passed by boot to kernel
 * and guaranteed to be below 1MB and aligned to 16 bytes. Should also
 * have identical physical and virtual address in paged mode.
 */
static ushort_t *warm_reset_vector = NULL;

int
mach_cpucontext_init(void)
{
        ushort_t *vec;
        ulong_t addr;
        struct rm_platter *rm = (struct rm_platter *)rm_platter_va;

        if (!(vec = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR,
            sizeof (vec), PROT_READ | PROT_WRITE)))
                return (-1);

        /*
         * Set up the secondary CPU BIOS warm-reset boot vector:
         * write the page offset to 0x467 and the real-mode segment to 0x469.
         */
        addr = (ulong_t)((caddr_t)rm->rm_code - (caddr_t)rm) + rm_platter_pa;
        vec[0] = (ushort_t)(addr & PAGEOFFSET);
        vec[1] = (ushort_t)((addr & (0xfffff & PAGEMASK)) >> 4);
        warm_reset_vector = vec;

        /* Map real mode platter into kas so kernel can access it. */
        hat_devload(kas.a_hat,
            (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
            btop(rm_platter_pa), PROT_READ | PROT_WRITE | PROT_EXEC,
            HAT_LOAD_NOCONSIST);

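        /*
         * When DR is enabled, the rm_code area is shared with the CPU stop
         * code and may be overwritten after boot, so the startup code is not
         * copied here; mach_cpucontext_xalloc() re-copies it each time a CPU
         * is started instead.
         */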
        /* Copy CPU startup code to rm_platter if it's still during boot. */
        if (!plat_dr_enabled()) {
                ASSERT((size_t)real_mode_start_cpu_end -
                    (size_t)real_mode_start_cpu <= RM_PLATTER_CODE_SIZE);
                bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
                    (size_t)real_mode_start_cpu_end -
                    (size_t)real_mode_start_cpu);
        }

        return (0);
}

void
mach_cpucontext_fini(void)
{
        if (warm_reset_vector)
                psm_unmap_phys((caddr_t)warm_reset_vector,
                    sizeof (warm_reset_vector));
        hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
            HAT_UNLOAD);
}

#if defined(__amd64)
extern void *long_mode_64(void);
#endif  /* __amd64 */

/*ARGSUSED*/
void
rmp_gdt_init(rm_platter_t *rm)
{

#if defined(__amd64)
        /* Use the kas address space for the CPU startup thread. */
        if (MAKECR3(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL)
                panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
                    "located above 4G in physical memory (@ 0x%lx)",
                    MAKECR3(kas.a_hat->hat_htable->ht_pfn));

        /*
         * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
         * by code in real_mode_start_cpu():
         *
         * GDT[0]: NULL selector
         * GDT[1]: 64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
         *
         * Clear the IDT as interrupts will be off and a limit of 0 will cause
         * the CPU to triple fault and reset on an NMI, seemingly as reasonable
         * a course of action as any other, though it may cause the entire
         * platform to reset in some cases...
         */
        rm->rm_temp_gdt[0] = 0;
        rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;

        rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1);
        rm->rm_temp_gdt_base = rm_platter_pa +
            (uint32_t)offsetof(rm_platter_t, rm_temp_gdt);
        rm->rm_temp_idt_lim = 0;
        rm->rm_temp_idt_base = 0;

        /*
         * Since the CPU needs to jump to protected mode using an identity
         * mapped address, we need to calculate it here.
         */
        rm->rm_longmode64_addr = rm_platter_pa +
            (uint32_t)((uintptr_t)long_mode_64 -
            (uintptr_t)real_mode_start_cpu);
#endif  /* __amd64 */
}

static void *
mach_cpucontext_alloc_tables(struct cpu *cp)
{
        tss_t *ntss;
        struct cpu_tables *ct;

        /*
         * Allocate space for stack, tss, gdt and idt. We round the size
         * allotted for cpu_tables up, so that the TSS is on a unique page.
         * This is more efficient when running in virtual machines.
         */
        ct = kmem_zalloc(P2ROUNDUP(sizeof (*ct), PAGESIZE), KM_SLEEP);
        if ((uintptr_t)ct & PAGEOFFSET)
                panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
                    cp->cpu_id);

        ntss = cp->cpu_tss = &ct->ct_tss;

#if defined(__amd64)

        /*
         * #DF (double fault).
         */
        ntss->tss_ist1 = (uint64_t)&ct->ct_stack[sizeof (ct->ct_stack)];

#elif defined(__i386)

        ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp =
            (uint32_t)&ct->ct_stack[sizeof (ct->ct_stack)];

        ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL;

        ntss->tss_eip = (uint32_t)cp->cpu_thread->t_pc;

        ntss->tss_cs = KCS_SEL;
        ntss->tss_ds = ntss->tss_es = KDS_SEL;
        ntss->tss_fs = KFS_SEL;
        ntss->tss_gs = KGS_SEL;

#endif  /* __i386 */

        /*
         * Set I/O bit map offset equal to size of TSS segment limit
         * for no I/O permission map. This will cause all user I/O
         * instructions to generate #gp fault.
         */
        ntss->tss_bitmapbase = sizeof (*ntss);

        /*
         * Setup kernel tss.
         */
        set_syssegd((system_desc_t *)&cp->cpu_gdt[GDT_KTSS], cp->cpu_tss,
            sizeof (*cp->cpu_tss) - 1, SDT_SYSTSS, SEL_KPL);

        return (ct);
}

void *
mach_cpucontext_xalloc(struct cpu *cp, int optype)
{
        size_t len;
        struct cpu_tables *ct;
        rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
        static int cpu_halt_code_ready;

        if (optype == MACH_CPUCONTEXT_OP_STOP) {
                ASSERT(plat_dr_enabled());

                /*
                 * The WARM_RESET_VECTOR has a limitation that the physical
                 * address written to it must be page-aligned. To work around
                 * this limitation, the CPU stop code has been split into
                 * two stages.
                 * The stage 2 code, which implements the real logic to halt
                 * CPUs, is copied to the rm_cpu_halt_code field in the real
                 * mode platter. The stage 1 code, which simply jumps to the
                 * stage 2 code in the rm_cpu_halt_code field, is copied to
                 * the rm_code field in the real mode platter and may be
                 * overwritten after the CPU has been stopped.
                 */
                if (!cpu_halt_code_ready) {
                        /*
                         * The rm_cpu_halt_code field in the real mode platter
                         * is used by the CPU stop code only. So only copy the
                         * CPU stop stage 2 code into the rm_cpu_halt_code
                         * field on the first call.
                         */
                        len = (size_t)real_mode_stop_cpu_stage2_end -
                            (size_t)real_mode_stop_cpu_stage2;
                        ASSERT(len <= RM_PLATTER_CPU_HALT_CODE_SIZE);
                        bcopy((caddr_t)real_mode_stop_cpu_stage2,
                            (caddr_t)rm->rm_cpu_halt_code, len);
                        cpu_halt_code_ready = 1;
                }

                /*
                 * The rm_code field in the real mode platter is shared by
                 * the CPU start, CPU stop, CPR and fast reboot code. So copy
                 * the CPU stop stage 1 code into the rm_code field every time.
                 */
                len = (size_t)real_mode_stop_cpu_stage1_end -
                    (size_t)real_mode_stop_cpu_stage1;
                ASSERT(len <= RM_PLATTER_CODE_SIZE);
                bcopy((caddr_t)real_mode_stop_cpu_stage1,
                    (caddr_t)rm->rm_code, len);
                rm->rm_cpu_halted = 0;

                return (cp->cpu_m.mcpu_mach_ctx_ptr);
        } else if (optype != MACH_CPUCONTEXT_OP_START) {
                return (NULL);
        }

        /*
         * Only need to allocate tables when starting CPU.
         * Tables allocated when starting CPU will be reused when stopping CPU.
         */
        ct = mach_cpucontext_alloc_tables(cp);
        if (ct == NULL) {
                return (NULL);
        }

        /* Copy CPU startup code to rm_platter for CPU hot-add operations. */
        if (plat_dr_enabled()) {
                bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
                    (size_t)real_mode_start_cpu_end -
                    (size_t)real_mode_start_cpu);
        }

        /*
         * Now copy all that we've set up onto the real mode platter
         * for the real mode code to digest as part of starting the cpu.
         */
        rm->rm_idt_base = cp->cpu_idt;
        rm->rm_idt_lim = sizeof (*cp->cpu_idt) * NIDT - 1;
        rm->rm_gdt_base = cp->cpu_gdt;
        rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1;

        /*
         * The CPU needs to access the kernel address space after powering on.
         * When hot-adding a CPU at runtime, use the top level page table of
         * kas directly instead of the return value of getcr3(). getcr3()
         * returns the current process's top level page table, which may be
         * different from that of kas.
         */
        rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn);
        rm->rm_cpu = cp->cpu_id;
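
        /*
         * Note that rm_pdbr above is the same CR3 value that rmp_gdt_init()
         * requires to be below 4G (see the panic check there).
         */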

        /*
         * For hot-adding a CPU at runtime, Machine Check and Performance
         * Counter should be disabled. They will be enabled on demand after
         * the CPU powers on successfully.
         */
        rm->rm_cr4 = getcr4();
        rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE);

        rmp_gdt_init(rm);

        return (ct);
}

void
mach_cpucontext_xfree(struct cpu *cp, void *arg, int err, int optype)
{
        struct cpu_tables *ct = arg;

        ASSERT(&ct->ct_tss == cp->cpu_tss);
        if (optype == MACH_CPUCONTEXT_OP_START) {
                switch (err) {
                case 0:
                        /*
                         * Save pointer for reuse when stopping CPU.
                         */
                        cp->cpu_m.mcpu_mach_ctx_ptr = arg;
                        break;
                case ETIMEDOUT:
                        /*
                         * The processor was poked, but failed to start before
                         * we gave up waiting for it. In case it starts later,
                         * don't free anything.
                         */
                        cp->cpu_m.mcpu_mach_ctx_ptr = arg;
                        break;
                default:
                        /*
                         * Some other, passive, error occurred.
                         */
                        kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
                        cp->cpu_tss = NULL;
                        break;
                }
        } else if (optype == MACH_CPUCONTEXT_OP_STOP) {
                switch (err) {
                case 0:
                        /*
                         * Free resources allocated when starting CPU.
                         */
                        kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
                        cp->cpu_tss = NULL;
                        cp->cpu_m.mcpu_mach_ctx_ptr = NULL;
                        break;
                default:
                        /*
                         * Don't touch table pointer in case of failure.
                         */
                        break;
                }
        } else {
                ASSERT(0);
        }
}

void *
mach_cpucontext_alloc(struct cpu *cp)
{
        return (mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_START));
}

void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
        mach_cpucontext_xfree(cp, arg, err, MACH_CPUCONTEXT_OP_START);
}

/*
 * "Enter monitor."  Called via cross-call from stop_other_cpus().
 */
void
mach_cpu_halt(char *msg)
{
        if (msg)
                prom_printf("%s\n", msg);

        /*CONSTANTCONDITION*/
        while (1)
                ;
}

void
mach_cpu_idle(void)
{
        i86_halt();
}

void
mach_cpu_pause(volatile char *safe)
{
        /*
         * This cpu is now safe.
         */
        *safe = PAUSE_WAIT;
        membar_enter(); /* make sure stores are flushed */

        /*
         * Now we wait. When we are allowed to continue, safe
         * will be set to PAUSE_IDLE.
         */
        while (*safe != PAUSE_IDLE)
                SMT_PAUSE();
}

/*
 * Power on the target CPU.
 */
int
mp_cpu_poweron(struct cpu *cp)
{
        int error;
        cpuset_t tempset;
        processorid_t cpuid;

        ASSERT(cp != NULL);
        cpuid = cp->cpu_id;
        if (use_mp == 0 || plat_dr_support_cpu() == 0) {
                return (ENOTSUP);
        } else if (cpuid < 0 || cpuid >= max_ncpus) {
                return (EINVAL);
        }

        /*
         * The current x86 implementation of mp_cpu_configure() and
         * mp_cpu_poweron() has a limitation: mp_cpu_poweron() can only be
         * called once after calling mp_cpu_configure() for a specific CPU,
         * because mp_cpu_poweron() destroys the data structures created by
         * mp_cpu_configure(). So reject the request if the CPU has already
         * been powered on once after calling mp_cpu_configure().
         * This limitation only affects the p_online syscall; the DR driver
         * won't be affected because it always invokes the public CPU
         * management interfaces in the predefined order:
         * cpu_configure()->cpu_poweron()...->cpu_poweroff()->cpu_unconfigure()
         */
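        /*
         * Either condition below indicates that the CPU has already been
         * started at least once since mp_cpu_configure() was called.
         */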
        if (cpuid_checkpass(cp, 4) || cp->cpu_thread == cp->cpu_idle_thread) {
                return (ENOTSUP);
        }

        /*
         * Check if there's at least a Mbyte of kmem available
         * before attempting to start the cpu.
         */
        if (kmem_avail() < 1024 * 1024) {
                /*
                 * Kick off a reap in case that helps us with
                 * later attempts ..
                 */
                kmem_reap();
                return (ENOMEM);
        }

        affinity_set(CPU->cpu_id);

        /*
         * Start the target CPU. No need to call mach_cpucontext_fini()
         * if mach_cpucontext_init() fails.
         */
        if ((error = mach_cpucontext_init()) == 0) {
                error = mp_start_cpu_common(cp, B_FALSE);
                mach_cpucontext_fini();
        }
        if (error != 0) {
                affinity_clear();
                return (error);
        }

        /* Wait for the target cpu to reach READY state. */
        tempset = cpu_ready_set;
        while (!CPU_IN_SET(tempset, cpuid)) {
                delay(1);
                tempset = *((volatile cpuset_t *)&cpu_ready_set);
        }

        /* Mark the target CPU as available for mp operation. */
        CPUSET_ATOMIC_ADD(mp_cpus, cpuid);

        /* Free the space allocated to hold the microcode file */
        ucode_cleanup();

        affinity_clear();

        return (0);
}

#define MP_CPU_DETACH_MAX_TRIES         5
#define MP_CPU_DETACH_DELAY             100

static int
mp_cpu_detach_driver(dev_info_t *dip)
{
        int i;
        int rv = EBUSY;
        dev_info_t *pdip;

        pdip = ddi_get_parent(dip);
        ASSERT(pdip != NULL);
        /*
         * Check if caller holds pdip busy - can cause deadlocks in
         * e_ddi_branch_unconfigure(), which calls devfs_clean().
         */
        if (DEVI_BUSY_OWNED(pdip)) {
                return (EDEADLOCK);
        }

        for (i = 0; i < MP_CPU_DETACH_MAX_TRIES; i++) {
                if (e_ddi_branch_unconfigure(dip, NULL, 0) == 0) {
                        rv = 0;
                        break;
                }
                DELAY(MP_CPU_DETACH_DELAY);
        }

        return (rv);
}

/*
 * Power off the target CPU.
 * Note: cpu_lock will be released and then reacquired.
 */
int
mp_cpu_poweroff(struct cpu *cp)
{
        int rv = 0;
        void *ctx;
        dev_info_t *dip = NULL;
        rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
        extern void cpupm_start(cpu_t *);
        extern void cpupm_stop(cpu_t *);

        ASSERT(cp != NULL);
        ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
        ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);

        if (use_mp == 0 || plat_dr_support_cpu() == 0) {
                return (ENOTSUP);
        }
        /*
         * There is no support for powering off cpu0 yet.
         * There are many pieces of code which have a hard dependency on cpu0.
         */
        if (cp->cpu_id == 0) {
                return (ENOTSUP);
        }

        if (mach_cpu_get_device_node(cp, &dip) != PSM_SUCCESS) {
                return (ENXIO);
        }
        ASSERT(dip != NULL);
        if (mp_cpu_detach_driver(dip) != 0) {
                rv = EBUSY;
                goto out_online;
        }

        /* Allocate CPU context for stopping */
        if (mach_cpucontext_init() != 0) {
                rv = ENXIO;
                goto out_online;
        }
        ctx = mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_STOP);
        if (ctx == NULL) {
                rv = ENXIO;
                goto out_context_fini;
        }

        cpupm_stop(cp);
        cpu_event_fini_cpu(cp);

        if (cp->cpu_m.mcpu_cmi_hdl != NULL) {
                cmi_fini(cp->cpu_m.mcpu_cmi_hdl);
                cp->cpu_m.mcpu_cmi_hdl = NULL;
        }

        rv = mach_cpu_stop(cp, ctx);
        if (rv != 0) {
                goto out_enable_cmi;
        }

        /* Wait until the target CPU has been halted. */
        while (*(volatile ushort_t *)&(rm->rm_cpu_halted) != 0xdead) {
                delay(1);
        }
        rm->rm_cpu_halted = 0xffff;

        /* CPU_READY has been cleared by mach_cpu_stop. */
        ASSERT((cp->cpu_flags & CPU_READY) == 0);
        ASSERT((cp->cpu_flags & CPU_RUNNING) == 0);
        cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF;
        CPUSET_ATOMIC_DEL(mp_cpus, cp->cpu_id);

        mach_cpucontext_xfree(cp, ctx, 0, MACH_CPUCONTEXT_OP_STOP);
        mach_cpucontext_fini();

        return (0);

out_enable_cmi:
        {
                cmi_hdl_t hdl;

                if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp),
                    cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp))) != NULL) {
                        if (is_x86_feature(x86_featureset, X86FSET_MCA))
                                cmi_mca_init(hdl);
                        cp->cpu_m.mcpu_cmi_hdl = hdl;
                }
        }
        cpu_event_init_cpu(cp);
        cpupm_start(cp);
        mach_cpucontext_xfree(cp, ctx, rv, MACH_CPUCONTEXT_OP_STOP);

out_context_fini:
        mach_cpucontext_fini();

out_online:
        (void) e_ddi_branch_configure(dip, NULL, 0);

        if (rv != EAGAIN && rv != ETIME) {
                rv = ENXIO;
        }

        return (rv);
}

/*
 * Return the vcpu state. Since this could be a virtual environment that we
 * are unaware of, return "unknown".
 */
/* ARGSUSED */
int
vcpu_on_pcpu(processorid_t cpu)
{
        return (VCPU_STATE_UNKNOWN);
}