/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/cpuvar.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/promif.h>
#include <sys/platform_module.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/machsystm.h>
#include <sys/bootconf.h>
#include <sys/nvpair.h>
#include <sys/kobj.h>
#include <sys/mem_cage.h>
#include <sys/opl.h>
#include <sys/scfd/scfostoescf.h>
#include <sys/cpu_sgnblk_defs.h>
#include <sys/utsname.h>
#include <sys/ddi.h>
#include <sys/sunndi.h>
#include <sys/lgrp.h>
#include <sys/memnode.h>
#include <sys/time.h>
#include <sys/cpu.h>
#include <sys/dumphdr.h>
#include <vm/vm_dep.h>

int (*opl_get_mem_unum)(int, uint64_t, char *, int, int *);
int (*opl_get_mem_sid)(char *unum, char *buf, int buflen, int *lenp);
int (*opl_get_mem_offset)(uint64_t paddr, uint64_t *offp);
int (*opl_get_mem_addr)(char *unum, char *sid,
    uint64_t offset, uint64_t *paddr);

/* Memory for fcode claims.  16k times # maximum possible IO units */
#define EFCODE_SIZE     (OPL_MAX_BOARDS * OPL_MAX_IO_UNITS_PER_BOARD * 0x4000)
int efcode_size = EFCODE_SIZE;
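
/*
 * Illustratively, assuming OPL_MAX_BOARDS == 16 and
 * OPL_MAX_IO_UNITS_PER_BOARD == 16 (see <sys/opl.h>), EFCODE_SIZE
 * reserves 16 * 16 * 16KB == 4MB.
 */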

#define OPL_MC_MEMBOARD_SHIFT 38        /* Boards on 256GB boundary */

/* Set the maximum number of boards for DR */
int opl_boards = OPL_MAX_BOARDS;

void sgn_update_all_cpus(ushort_t, uchar_t, uchar_t);

extern int tsb_lgrp_affinity;

int opl_tsb_spares = (OPL_MAX_BOARDS) * (OPL_MAX_PCICH_UNITS_PER_BOARD) *
        (OPL_MAX_TSBS_PER_PCICH);

pgcnt_t opl_startup_cage_size = 0;

/*
 * The delay, in seconds, in communication with the XSCF after which a
 * warning message is logged (default 60 * 15 == 900 seconds, 15 minutes).
 */
uint_t  xscf_connect_delay = 60 * 15;

static opl_model_info_t opl_models[] = {
        { "FF1", OPL_MAX_BOARDS_FF1, FF1, STD_DISPATCH_TABLE },
        { "FF2", OPL_MAX_BOARDS_FF2, FF2, STD_DISPATCH_TABLE },
        { "DC1", OPL_MAX_BOARDS_DC1, DC1, STD_DISPATCH_TABLE },
        { "DC2", OPL_MAX_BOARDS_DC2, DC2, EXT_DISPATCH_TABLE },
        { "DC3", OPL_MAX_BOARDS_DC3, DC3, EXT_DISPATCH_TABLE },
        { "IKKAKU", OPL_MAX_BOARDS_IKKAKU, IKKAKU, STD_DISPATCH_TABLE },
};
static  int     opl_num_models = sizeof (opl_models)/sizeof (opl_model_info_t);

/*
 * opl_cur_model: the model we are running on, set by set_model_info().
 */
static  opl_model_info_t *opl_cur_model = NULL;

static struct memlist *opl_memlist_per_board(struct memlist *ml);
static void post_xscf_msg(char *, int);
static void pass2xscf_thread();

/*
 * Note: the FF/DC out-of-order instruction engine takes only a
 * single cycle to execute each spin-loop iteration; for comparison,
 * Panther takes 6 cycles for the same loop.
 * OPL_BOFF_SPIN = base spin loop, roughly one memory reference time
 * OPL_BOFF_TM = approx nsec for OPL sleep instruction (1600 for OPL-C)
 * OPL_BOFF_SLEEP = approx number of SPIN iterations to equal one sleep
 * OPL_BOFF_MAX_SCALE = scaling factor for max backoff based on active cpus
 * The listed values are tuned for 2.15GHz to 2.64GHz systems and
 * may change for future systems.
 */
#define OPL_BOFF_SPIN 7
#define OPL_BOFF_SLEEP 4
#define OPL_BOFF_TM 1600
#define OPL_BOFF_MAX_SCALE 8
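
/*
 * Worked example (illustrative): for a backoff of 100, plat_lock_delay()
 * below issues 100 / OPL_BOFF_SLEEP == 25 sleep instructions and then
 * keeps sleeping/spinning until at least 25 * OPL_BOFF_TM == 40000 ns
 * (40 usec) have elapsed, as measured with gethrtime_waitfree().
 */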

#define OPL_CLOCK_TICK_THRESHOLD        128
#define OPL_CLOCK_TICK_NCPUS            64

extern int      clock_tick_threshold;
extern int      clock_tick_ncpus;

int
set_platform_max_ncpus(void)
{
        return (OPL_MAX_CPU_PER_BOARD * OPL_MAX_BOARDS);
}

int
set_platform_tsb_spares(void)
{
        return (MIN(opl_tsb_spares, MAX_UPA));
}

static void
set_model_info()
{
        extern int ts_dispatch_extended;
        char    name[MAXSYSNAME];
        int     i;

        /*
         * Get the model name from the root node.
         *
         * We use the PROM device tree since, at this point, the
         * Solaris device tree is not yet set up.
         */
        (void) prom_getprop(prom_rootnode(), "model", (caddr_t)name);

        for (i = 0; i < opl_num_models; i++) {
                if (strncmp(name, opl_models[i].model_name, MAXSYSNAME) == 0) {
                        opl_cur_model = &opl_models[i];
                        break;
                }
        }

        /*
         * If no model matched, this is an unknown model; just return
         * and default to the standard dispatch tables.
         */
        if (i == opl_num_models)
                return;

        if ((opl_cur_model->model_cmds & EXT_DISPATCH_TABLE) &&
            (ts_dispatch_extended == -1)) {
                /*
                 * Select a dispatch table based on the platform model.
                 * Only DC2 and DC3 systems use the alternate/extended
                 * TS dispatch table; IKKAKU, FF1, FF2 and DC1 systems
                 * use the standard dispatch tables.
                 */
                ts_dispatch_extended = 1;
        }

}

static void
set_max_mmu_ctxdoms()
{
        extern uint_t   max_mmu_ctxdoms;
        int             max_boards;

        /*
         * From the model, get the maximum number of boards
         * supported and set the value accordingly. If the model
         * could not be determined or recognized, we assume the max value.
         */
        if (opl_cur_model == NULL)
                max_boards = OPL_MAX_BOARDS;
        else
                max_boards = opl_cur_model->model_max_boards;

        /*
         * On OPL, cores and MMUs are one-to-one.
         */
        max_mmu_ctxdoms = OPL_MAX_CORE_UNITS_PER_BOARD * max_boards;
}

#pragma weak mmu_init_large_pages

void
set_platform_defaults(void)
{
        extern char *tod_module_name;
        extern void cpu_sgn_update(ushort_t, uchar_t, uchar_t, int);
        extern void mmu_init_large_pages(size_t);

        /* Set the CPU signature function pointer */
        cpu_sgn_func = cpu_sgn_update;

        /* Set appropriate tod module for OPL platform */
        ASSERT(tod_module_name == NULL);
        tod_module_name = "todopl";

        if ((mmu_page_sizes == max_mmu_page_sizes) &&
            (mmu_ism_pagesize != DEFAULT_ISM_PAGESIZE)) {
                if (&mmu_init_large_pages)
                        mmu_init_large_pages(mmu_ism_pagesize);
        }

        tsb_lgrp_affinity = 1;

        set_max_mmu_ctxdoms();

        /* set OPL threshold for compressed dumps */
        dump_plat_mincpu_default = DUMP_PLAT_SUN4U_OPL_MINCPU;
}

/*
 * Convert a logical board number to a physical one.
 */

#define LSBPROP         "board#"
#define PSBPROP         "physical-board#"

int
opl_get_physical_board(int id)
{
        dev_info_t      *root_dip, *dip = NULL;
        char            *dname = NULL;
        int             circ;

        pnode_t         pnode;
        char            pname[MAXSYSNAME] = {0};

        int             lsb_id; /* Logical System Board ID */
        int             psb_id; /* Physical System Board ID */


        /*
         * This function is called both at an early stage of boot, when
         * the kernel device tree is not yet initialized, and later on,
         * when the device tree is up. Try the fast path first.
         */
        root_dip = ddi_root_node();
        if (root_dip) {
                /* Get from devinfo node */
                ndi_devi_enter(root_dip, &circ);
                for (dip = ddi_get_child(root_dip); dip;
                    dip = ddi_get_next_sibling(dip)) {

                        dname = ddi_node_name(dip);
                        if (strncmp(dname, "pseudo-mc", 9) != 0)
                                continue;

                        if ((lsb_id = (int)ddi_getprop(DDI_DEV_T_ANY, dip,
                            DDI_PROP_DONTPASS, LSBPROP, -1)) == -1)
                                continue;

                        if (id == lsb_id) {
                                if ((psb_id = (int)ddi_getprop(DDI_DEV_T_ANY,
                                    dip, DDI_PROP_DONTPASS, PSBPROP, -1))
                                    == -1) {
                                        ndi_devi_exit(root_dip, circ);
                                        return (-1);
                                } else {
                                        ndi_devi_exit(root_dip, circ);
                                        return (psb_id);
                                }
                        }
                }
                ndi_devi_exit(root_dip, circ);
        }

        /*
         * Either we do not have the kernel device tree, or we did not
         * find the node for some reason (say, the kernel device tree
         * was modified); try the OBP tree.
         */
        pnode = prom_rootnode();
        for (pnode = prom_childnode(pnode); pnode;
            pnode = prom_nextnode(pnode)) {

                if ((prom_getprop(pnode, "name", (caddr_t)pname) == -1) ||
                    (strncmp(pname, "pseudo-mc", 9) != 0))
                        continue;

                if (prom_getprop(pnode, LSBPROP, (caddr_t)&lsb_id) == -1)
                        continue;

                if (id == lsb_id) {
                        if (prom_getprop(pnode, PSBPROP,
                            (caddr_t)&psb_id) == -1) {
                                return (-1);
                        } else {
                                return (psb_id);
                        }
                }
        }

        return (-1);
}

/*
 * On OPL it is possible for memory from two or more successive boards
 * to be contiguous across the boards, and therefore represented as a
 * single chunk.
 * This function splits such chunks along board boundaries.
 */
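/*
 * For example (illustrative): with the 256GB slice size, a single
 * memlist entry covering [0, 512GB) spans two board slices and is
 * split into [0, 256GB) and [256GB, 512GB).
 */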
static struct memlist *
opl_memlist_per_board(struct memlist *ml)
{
        uint64_t ssize, low, high, boundary;
        struct memlist *head, *tail, *new;

        ssize = (1ull << OPL_MC_MEMBOARD_SHIFT);

        head = tail = NULL;

        for (; ml; ml = ml->ml_next) {
                low  = (uint64_t)ml->ml_address;
                high = low+(uint64_t)(ml->ml_size);
                while (low < high) {
                        boundary = roundup(low+1, ssize);
                        boundary = MIN(high, boundary);
                        new = kmem_zalloc(sizeof (struct memlist), KM_SLEEP);
                        new->ml_address = low;
                        new->ml_size = boundary - low;
                        if (head == NULL)
                                head = new;
                        if (tail) {
                                tail->ml_next = new;
                                new->ml_prev = tail;
                        }
                        tail = new;
                        low = boundary;
                }
        }
        return (head);
}

void
set_platform_cage_params(void)
{
        extern pgcnt_t total_pages;
        extern struct memlist *phys_avail;
        struct memlist *ml, *tml;

        if (kernel_cage_enable) {
                pgcnt_t preferred_cage_size;

                preferred_cage_size = MAX(opl_startup_cage_size,
                    total_pages / 256);

                ml = opl_memlist_per_board(phys_avail);

                /*
                 * Note: we are assuming that POST has loaded the
                 * whole show into the high end of memory. Having
                 * taken this leap, we copy the whole of phys_avail
                 * to the cage list and arrange for the cage to grow
                 * downward (descending pfns).
                 */
                kcage_range_init(ml, KCAGE_DOWN, preferred_cage_size);

                /* free the memlist */
                do {
                        tml = ml->ml_next;
                        kmem_free(ml, sizeof (struct memlist));
                        ml = tml;
                } while (ml != NULL);
        }

        if (kcage_on)
                cmn_err(CE_NOTE, "!DR Kernel Cage is ENABLED");
        else
                cmn_err(CE_NOTE, "!DR Kernel Cage is DISABLED");
}
 390 
 391 /*ARGSUSED*/
 392 int
 393 plat_cpu_poweron(struct cpu *cp)
 394 {
 395         int (*opl_cpu_poweron)(struct cpu *) = NULL;
 396 
 397         opl_cpu_poweron =
 398             (int (*)(struct cpu *))kobj_getsymvalue("drmach_cpu_poweron", 0);
 399 
 400         if (opl_cpu_poweron == NULL)
 401                 return (ENOTSUP);
 402         else
 403                 return ((opl_cpu_poweron)(cp));
 404 
 405 }
 406 
 407 /*ARGSUSED*/
 408 int
 409 plat_cpu_poweroff(struct cpu *cp)
 410 {
 411         int (*opl_cpu_poweroff)(struct cpu *) = NULL;
 412 
 413         opl_cpu_poweroff =
 414             (int (*)(struct cpu *))kobj_getsymvalue("drmach_cpu_poweroff", 0);
 415 
 416         if (opl_cpu_poweroff == NULL)
 417                 return (ENOTSUP);
 418         else
 419                 return ((opl_cpu_poweroff)(cp));
 420 
 421 }
 422 
 423 int
 424 plat_max_boards(void)
 425 {
 426         /*
 427          * If the model cannot be determined, default to the max value.
 428          * Otherwise, Ikkaku model only supports 1 system board.
 429          */
 430         if ((opl_cur_model != NULL) && (opl_cur_model->model_type == IKKAKU))
 431                 return (OPL_MAX_BOARDS_IKKAKU);
 432         else
 433                 return (OPL_MAX_BOARDS);
 434 }

int
plat_max_cpu_units_per_board(void)
{
        return (OPL_MAX_CPU_PER_BOARD);
}

int
plat_max_mem_units_per_board(void)
{
        return (OPL_MAX_MEM_UNITS_PER_BOARD);
}

int
plat_max_io_units_per_board(void)
{
        return (OPL_MAX_IO_UNITS_PER_BOARD);
}

int
plat_max_cmp_units_per_board(void)
{
        return (OPL_MAX_CMP_UNITS_PER_BOARD);
}

int
plat_max_core_units_per_board(void)
{
        return (OPL_MAX_CORE_UNITS_PER_BOARD);
}

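/*
 * With 8KB base pages (MMU_PAGESHIFT == 13 on sun4u), mem_node_pfn_shift
 * set below in plat_build_mem_nodes() is 38 - 13 == 25, so each memnode
 * spans 2^25 pages == 256GB, matching the per-board memory slice.
 */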
int
plat_pfn_to_mem_node(pfn_t pfn)
{
        return (pfn >> mem_node_pfn_shift);
}

/* ARGSUSED */
void
plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
{
        size_t  elem;
        pfn_t   basepfn;
        pgcnt_t npgs;
        uint64_t        boundary, ssize;
        uint64_t        low, high;

        /*
         * OPL mem slices are always aligned on a 256GB boundary.
         */
        mem_node_pfn_shift = OPL_MC_MEMBOARD_SHIFT - MMU_PAGESHIFT;
        mem_node_physalign = 0;

        /*
         * Boot install lists are arranged <addr, len>, <addr, len>, ...
         */
        ssize = (1ull << OPL_MC_MEMBOARD_SHIFT);
        for (elem = 0; elem < nelems; list++, elem++) {
                low  = list->addr;
                high = low + list->size;
                while (low < high) {
                        boundary = roundup(low+1, ssize);
                        boundary = MIN(high, boundary);
                        basepfn = btop(low);
                        npgs = btop(boundary - low);
                        mem_node_add_slice(basepfn, basepfn + npgs - 1);
                        low = boundary;
                }
        }
}

/*
 * Associate the memory slice of a board with its memory node (and
 * thus its lgroup) at boot time.
 */
void
plat_fill_mc(pnode_t nodeid)
{
        int board;
        int memnode;
        struct {
                uint64_t        addr;
                uint64_t        size;
        } mem_range;

        if (prom_getprop(nodeid, "board#", (caddr_t)&board) < 0) {
                panic("Cannot find board# property in mc node %x", nodeid);
        }
        if (prom_getprop(nodeid, "sb-mem-ranges", (caddr_t)&mem_range) < 0) {
                panic("Cannot find sb-mem-ranges property in mc node %x",
                    nodeid);
        }
        memnode = mem_range.addr >> OPL_MC_MEMBOARD_SHIFT;
        plat_assign_lgrphand_to_mem_node(board, memnode);
}

/*
 * Return the platform handle for the lgroup containing the given CPU
 *
 * For OPL, lgroup platform handle == board #.
 */

extern int mpo_disabled;
extern lgrp_handle_t lgrp_default_handle;

lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
        lgrp_handle_t plathand;

        /*
         * Return the real platform handle for the CPU until
         * such time as we know that MPO should be disabled.
         * At that point, we set the "mpo_disabled" flag to true,
         * and from that point on, return the default handle.
         *
         * By the time we know that MPO should be disabled, the
         * first CPU will have already been added to a leaf
         * lgroup, but that's ok. The common lgroup code will
         * double check that the boot CPU is in the correct place,
         * and in the case where mpo should be disabled, will move
         * it to the root if necessary.
         */
        if (mpo_disabled) {
                /* If MPO is disabled, return the default (UMA) handle */
                plathand = lgrp_default_handle;
        } else
                plathand = (lgrp_handle_t)LSB_ID(id);
        return (plathand);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
        extern uint32_t lgrp_expand_proc_thresh;
        extern uint32_t lgrp_expand_proc_diff;
        const uint_t m = LGRP_LOADAVG_THREAD_MAX;

        /*
         * Set tuneables for the OPL architecture
         *
         * lgrp_expand_proc_thresh is the threshold load on the set of
         * lgroups a process is currently using before considering
         * adding another lgroup to the set.  For Olympus-C and Jupiter
         * systems, there are four sockets per lgroup. Setting
         * lgrp_expand_proc_thresh to add lgroups when the load reaches
         * four threads will spread the load when it exceeds one thread
         * per socket, optimizing memory bandwidth and L2 cache space.
         *
         * lgrp_expand_proc_diff determines how much less loaded another
         * lgroup must be before shifting the start location of a thread
         * to it.
         *
         * lgrp_loadavg_tolerance is the threshold where two lgroups are
         * considered to have different loads.  It is set to be less than
         * 1% so that even a small residual load will be considered different
         * from no residual load.
         *
         * We note loadavg values are not precise.
         * Every 1/10 of a second loadavg values are reduced by 5%.
         * This adjustment can come in the middle of the lgroup selection
         * process, and for larger parallel apps with many threads can
         * frequently occur between the start of the second thread
         * placement and the finish of the last thread placement.
         * We also must be careful not to use too small a threshold,
         * since the cumulative decay for 1 second of idle time is 40%.
         * That is, the residual load from completed threads will still
         * be 60% one second after the proc goes idle, or 8% after 5 seconds.
         *
         * To allow for lag time in loadavg calculations
         * remote thresh = 3.75 * LGRP_LOADAVG_THREAD_MAX
         * local thresh  = 0.75 * LGRP_LOADAVG_THREAD_MAX
         * tolerance     = 0.0078 * LGRP_LOADAVG_THREAD_MAX
         *
         * The load placement algorithms consider LGRP_LOADAVG_THREAD_MAX
         * as the equivalent of a load of 1. To make the code more compact,
         * we set m = LGRP_LOADAVG_THREAD_MAX.
         */
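        /*
         * Sanity check (illustrative): (m * 3) + (m >> 1) + (m >> 2)
         * == 3.75m, (m >> 1) + (m >> 2) == 0.75m, and m >> 7 == m/128
         * ~= 0.0078m, matching the targets listed above.
         */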
        lgrp_expand_proc_thresh = (m * 3) + (m >> 1) + (m >> 2);
        lgrp_expand_proc_diff = (m >> 1) + (m >> 2);
        lgrp_loadavg_tolerance = (m >> 7);
}

/*
 * Platform notification of lgroup (re)configuration changes
 */
/*ARGSUSED*/
void
plat_lgrp_config(lgrp_config_flag_t evt, uintptr_t arg)
{
        update_membounds_t *umb;
        lgrp_config_mem_rename_t lmr;
        int sbd, tbd;
        lgrp_handle_t hand, shand, thand;
        int mnode, snode, tnode;
        pfn_t start, end;

        if (mpo_disabled)
                return;

        switch (evt) {

        case LGRP_CONFIG_MEM_ADD:
                /*
                 * Establish the lgroup handle to memnode translation.
                 */
                umb = (update_membounds_t *)arg;

                hand = umb->u_board;
                mnode = plat_pfn_to_mem_node(umb->u_base >> MMU_PAGESHIFT);
                plat_assign_lgrphand_to_mem_node(hand, mnode);

                break;

        case LGRP_CONFIG_MEM_DEL:
                /*
                 * Special handling for possible memory holes.
                 */
                umb = (update_membounds_t *)arg;
                hand = umb->u_board;
                if ((mnode = plat_lgrphand_to_mem_node(hand)) != -1) {
                        if (mem_node_config[mnode].exists) {
                                start = mem_node_config[mnode].physbase;
                                end = mem_node_config[mnode].physmax;
                                mem_node_del_slice(start, end);
                        }
                }

                break;

        case LGRP_CONFIG_MEM_RENAME:
                /*
                 * During a DR copy-rename operation, all of the memory
                 * on one board is moved to another board -- but the
                 * addresses/pfns and memnodes don't change. This means
                 * the memory has changed locations without changing identity.
                 *
                 * Source is where we are copying from and target is where we
                 * are copying to.  After source memnode is copied to target
                 * memnode, the physical addresses of the target memnode are
                 * renamed to match what the source memnode had.  Then target
                 * memnode can be removed and source memnode can take its
                 * place.
                 *
                 * To do this, swap the lgroup handle to memnode mappings for
                 * the boards, so target lgroup will have source memnode and
                 * source lgroup will have empty target memnode which is where
                 * its memory will go (if any is added to it later).
                 *
                 * Then source memnode needs to be removed from its lgroup
                 * and added to the target lgroup where the memory was living
                 * but under a different name/memnode.  The memory was in the
                 * target memnode and now lives in the source memnode with
                 * different physical addresses even though it is the same
                 * memory.
                 */
                sbd = arg & 0xffff;
                tbd = (arg & 0xffff0000) >> 16;
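                /*
                 * Note: arg packs the source board number in bits 15:0
                 * and the target board number in bits 31:16.
                 */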
                shand = sbd;
                thand = tbd;
                snode = plat_lgrphand_to_mem_node(shand);
                tnode = plat_lgrphand_to_mem_node(thand);

                /*
                 * Special handling for possible memory holes.
                 */
                if (tnode != -1 && mem_node_config[tnode].exists) {
                        start = mem_node_config[tnode].physbase;
                        end = mem_node_config[tnode].physmax;
                        mem_node_del_slice(start, end);
                }

                plat_assign_lgrphand_to_mem_node(thand, snode);
                plat_assign_lgrphand_to_mem_node(shand, tnode);

                lmr.lmem_rename_from = shand;
                lmr.lmem_rename_to = thand;

                /*
                 * Remove source memnode of copy rename from its lgroup
                 * and add it to its new target lgroup
                 */
                lgrp_config(LGRP_CONFIG_MEM_RENAME, (uintptr_t)snode,
                    (uintptr_t)&lmr);

                break;

        default:
                break;
        }
}

/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number was at least proportional to make comparisons more meaningful though.
 * NOTE: The numbers below are supposed to be load latencies for uncached
 * memory divided by 10.
 */
int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
        /*
         * Return min remote latency when there are more than two lgroups
         * (root and child) and getting latency between two different lgroups
         * or root is involved
         */
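        /* i.e., roughly 420ns remote vs. 350ns local, per the note above */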
        if (lgrp_optimizations() && (from != to ||
            from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE))
                return (42);
        else
                return (35);
}

/*
 * Return platform handle for root lgroup
 */
lgrp_handle_t
plat_lgrp_root_hand(void)
{
        if (mpo_disabled)
                return (lgrp_default_handle);

        return (LGRP_DEFAULT_HANDLE);
}

/*ARGSUSED*/
void
plat_freelist_process(int mnode)
{
}

void
load_platform_drivers(void)
{
        (void) i_ddi_attach_pseudo_node("dr");
}

/*
 * No platform drivers on this platform
 */
char *platform_module_list[] = {
        (char *)0
};

/*ARGSUSED*/
void
plat_tod_fault(enum tod_fault_type tod_bad)
{
}

/*ARGSUSED*/
void
cpu_sgn_update(ushort_t sgn, uchar_t state, uchar_t sub_state, int cpuid)
{
        static void (*scf_panic_callback)(int);
        static void (*scf_shutdown_callback)(int);

        /*
         * This notifies the SCF of a system panic or shutdown.
         * In the shutdown and panic cases the SCF callback function
         * must be called with cpuid == -1 and state == SIGST_EXIT.
         *  <SCF callback functions>
         *   scf_panic_callb()   : panicsys()->panic_quiesce_hw()
         *   scf_shutdown_callb(): halt() or power_down() or reboot_machine()
         */
        if (state == SIGST_EXIT && cpuid == -1) {

                /*
                 * find the symbol for the SCF panic callback routine in driver
                 */
                if (scf_panic_callback == NULL)
                        scf_panic_callback = (void (*)(int))
                            modgetsymvalue("scf_panic_callb", 0);
                if (scf_shutdown_callback == NULL)
                        scf_shutdown_callback = (void (*)(int))
                            modgetsymvalue("scf_shutdown_callb", 0);

                switch (sub_state) {
                case SIGSUBST_PANIC:
                        if (scf_panic_callback == NULL) {
                                cmn_err(CE_NOTE, "!cpu_sgn_update: "
                                    "scf_panic_callb not found\n");
                                return;
                        }
                        scf_panic_callback(SIGSUBST_PANIC);
                        break;

                case SIGSUBST_HALT:
                        if (scf_shutdown_callback == NULL) {
                                cmn_err(CE_NOTE, "!cpu_sgn_update: "
                                    "scf_shutdown_callb not found\n");
                                return;
                        }
                        scf_shutdown_callback(SIGSUBST_HALT);
                        break;

                case SIGSUBST_ENVIRON:
                        if (scf_shutdown_callback == NULL) {
                                cmn_err(CE_NOTE, "!cpu_sgn_update: "
                                    "scf_shutdown_callb not found\n");
                                return;
                        }
                        scf_shutdown_callback(SIGSUBST_ENVIRON);
                        break;

                case SIGSUBST_REBOOT:
                        if (scf_shutdown_callback == NULL) {
                                cmn_err(CE_NOTE, "!cpu_sgn_update: "
                                    "scf_shutdown_callb not found\n");
                                return;
                        }
                        scf_shutdown_callback(SIGSUBST_REBOOT);
                        break;
                }
        }
}

/*ARGSUSED*/
int
plat_get_mem_unum(int synd_code, uint64_t flt_addr, int flt_bus_id,
        int flt_in_memory, ushort_t flt_status,
        char *buf, int buflen, int *lenp)
{
        /*
         * check if it's a memory error.
         */
        if (flt_in_memory) {
                if (opl_get_mem_unum != NULL) {
                        return (opl_get_mem_unum(synd_code, flt_addr, buf,
                            buflen, lenp));
                } else {
                        return (ENOTSUP);
                }
        } else {
                return (ENOTSUP);
        }
}

/*ARGSUSED*/
int
plat_get_cpu_unum(int cpuid, char *buf, int buflen, int *lenp)
{
        int     ret = 0;
        int     sb;
        int     plen;

        sb = opl_get_physical_board(LSB_ID(cpuid));
        if (sb == -1) {
                return (ENXIO);
        }

        /*
         * opl_cur_model is set here if it has not been set already.
         */
        if (opl_cur_model == NULL) {
                set_model_info();

                /*
                 * if the model is still not matched, return
                 */
                if (opl_cur_model == NULL)
                        return (ENODEV);
        }

        ASSERT((opl_cur_model - opl_models) == (opl_cur_model->model_type));

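        /*
         * Example unums (illustrative): an FF1 CPU with CHIP_ID 2 yields
         * "/MBU_A/CPUM1"; a DC-class CPU on physical board 3 with
         * CHIP_ID 2 yields "/CMU03/CPUM2".
         */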
        switch (opl_cur_model->model_type) {
        case FF1:
                plen = snprintf(buf, buflen, "/%s/CPUM%d", "MBU_A",
                    CHIP_ID(cpuid) / 2);
                break;

        case FF2:
                plen = snprintf(buf, buflen, "/%s/CPUM%d", "MBU_B",
                    (CHIP_ID(cpuid) / 2) + (sb * 2));
                break;

        case DC1:
        case DC2:
        case DC3:
                plen = snprintf(buf, buflen, "/%s%02d/CPUM%d", "CMU", sb,
                    CHIP_ID(cpuid));
                break;

        case IKKAKU:
                plen = snprintf(buf, buflen, "/%s", "MBU_A");
                break;

        default:
                /* This should never happen */
                return (ENODEV);
        }

        if (plen >= buflen) {
                ret = ENOSPC;
        } else {
                if (lenp)
                        *lenp = strlen(buf);
        }
        return (ret);
}

void
plat_nodename_set(void)
{
        post_xscf_msg((char *)&utsname, sizeof (struct utsname));
}

caddr_t efcode_vaddr = NULL;

/*
 * Preallocate enough memory for fcode claims.
 */

caddr_t
efcode_alloc(caddr_t alloc_base)
{
        caddr_t efcode_alloc_base = (caddr_t)roundup((uintptr_t)alloc_base,
            MMU_PAGESIZE);
        caddr_t vaddr;

        /*
         * allocate the physical memory for the Oberon fcode.
         */
        if ((vaddr = (caddr_t)BOP_ALLOC(bootops, efcode_alloc_base,
            efcode_size, MMU_PAGESIZE)) == NULL)
                cmn_err(CE_PANIC, "Cannot allocate Efcode Memory");

        efcode_vaddr = vaddr;

        return (efcode_alloc_base + efcode_size);
}

caddr_t
plat_startup_memlist(caddr_t alloc_base)
{
        caddr_t tmp_alloc_base;

        tmp_alloc_base = efcode_alloc(alloc_base);
        tmp_alloc_base =
            (caddr_t)roundup((uintptr_t)tmp_alloc_base, ecache_alignsize);
        return (tmp_alloc_base);
}

/* need to forward declare this */
static void plat_lock_delay(uint_t);

void
startup_platform(void)
{
        if (clock_tick_threshold == 0)
                clock_tick_threshold = OPL_CLOCK_TICK_THRESHOLD;
        if (clock_tick_ncpus == 0)
                clock_tick_ncpus = OPL_CLOCK_TICK_NCPUS;
        mutex_lock_delay = plat_lock_delay;
        mutex_cap_factor = OPL_BOFF_MAX_SCALE;
}

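/*
 * Compose a dense, machine-wide MMU id from the physical board, chip
 * and core ids.  For example, assuming OPL_MAX_COREID_PER_BOARD == 16
 * and OPL_MAX_COREID_PER_CMP == 4 (see <sys/opl.h>), physical board 2,
 * chip 1, core 3 maps to 2*16 + 1*4 + 3 == 39.
 */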
static uint_t
get_mmu_id(processorid_t cpuid)
{
        int pb = opl_get_physical_board(LSB_ID(cpuid));

        if (pb == -1) {
                cmn_err(CE_PANIC,
                    "opl_get_physical_board failed (cpu %d LSB %u)",
                    cpuid, LSB_ID(cpuid));
        }
        return (pb * OPL_MAX_COREID_PER_BOARD) + (CHIP_ID(cpuid) *
            OPL_MAX_COREID_PER_CMP) + CORE_ID(cpuid);
}

void
plat_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *info)
{
        int     impl;

        impl = cpunodes[cpuid].implementation;
        if (IS_OLYMPUS_C(impl) || IS_JUPITER(impl)) {
                info->mmu_idx = get_mmu_id(cpuid);
                info->mmu_nctxs = 8192;
        } else {
                cmn_err(CE_PANIC, "Unknown processor %d", impl);
        }
}

int
plat_get_mem_sid(char *unum, char *buf, int buflen, int *lenp)
{
        if (opl_get_mem_sid == NULL) {
                return (ENOTSUP);
        }
        return (opl_get_mem_sid(unum, buf, buflen, lenp));
}

int
plat_get_mem_offset(uint64_t paddr, uint64_t *offp)
{
        if (opl_get_mem_offset == NULL) {
                return (ENOTSUP);
        }
        return (opl_get_mem_offset(paddr, offp));
}

int
plat_get_mem_addr(char *unum, char *sid, uint64_t offset, uint64_t *addrp)
{
        if (opl_get_mem_addr == NULL) {
                return (ENOTSUP);
        }
        return (opl_get_mem_addr(unum, sid, offset, addrp));
}

static void
plat_lock_delay(uint_t backoff)
{
        int i;
        uint_t cnt, remcnt;
        int ctr;
        hrtime_t delay_start, rem_delay;
        /*
         * Platform specific lock delay code for OPL
         *
         * Uses staged linear increases in the delay.
         * The sleep instruction is the preferred method of delay,
         * but its granularity is too coarse for the initial backoff.
         */

        if (backoff < 100) {
                /*
                 * If the desired backoff is long enough,
                 * use sleep for most of it
                 */
                for (cnt = backoff;
                    cnt >= OPL_BOFF_SLEEP;
                    cnt -= OPL_BOFF_SLEEP) {
                        cpu_smt_pause();
                }
                /*
                 * spin for the small remainder of the backoff
                 */
                for (ctr = cnt * OPL_BOFF_SPIN; ctr; ctr--) {
                        mutex_delay_default();
                }
        } else {
                /* backoff is large.  Fill it by sleeping */
                delay_start = gethrtime_waitfree();
                cnt = backoff / OPL_BOFF_SLEEP;
                /*
                 * use sleep instructions for delay
                 */
                for (i = 0; i < cnt; i++) {
                        cpu_smt_pause();
                }

                /*
                 * Note: if the other strand executes a sleep instruction,
                 * then the sleep ends immediately with a minimum time of
                 * 42 clocks.  We check gethrtime to ensure we have
                 * waited long enough.  And we include both a short
                 * spin loop and a sleep for repeated delay times.
                 */

                rem_delay = gethrtime_waitfree() - delay_start;
                while (rem_delay < cnt * OPL_BOFF_TM) {
                        remcnt = cnt - (rem_delay / OPL_BOFF_TM);
                        for (i = 0; i < remcnt; i++) {
                                cpu_smt_pause();
                                for (ctr = OPL_BOFF_SPIN; ctr; ctr--) {
                                        mutex_delay_default();
                                }
                        }
                        rem_delay = gethrtime_waitfree() - delay_start;
                }
        }
}

/*
 * The following code implements an asynchronous call to the XSCF to set
 * up the domain node name.
 */

#define FREE_MSG(m)             kmem_free((m), NM_LEN((m)->len))

/*
 * The following three macros define all the operations on the request
 * list we are using here, and hide the details of the list
 * implementation from the code.
 */
#define PUSH(m) \
        { \
                (m)->next = ctl_msg.head; \
                (m)->prev = NULL; \
                if ((m)->next != NULL) \
                        (m)->next->prev = (m); \
                ctl_msg.head = (m); \
        }

#define REMOVE(m) \
        { \
                if ((m)->prev != NULL) \
                        (m)->prev->next = (m)->next; \
                else \
                        ctl_msg.head = (m)->next; \
                if ((m)->next != NULL) \
                        (m)->next->prev = (m)->prev; \
        }

#define FREE_THE_TAIL(head) \
        { \
                nm_msg_t *n_msg, *m; \
                m = (head)->next; \
                (head)->next = NULL; \
                while (m != NULL) { \
                        n_msg = m->next; \
                        FREE_MSG(m); \
                        m = n_msg; \
                } \
        }

#define SCF_PUTINFO(f, s, p) \
        f(KEY_ESCF, 0x01, 0, s, p)

#define PASS2XSCF(m, r) ((r = SCF_PUTINFO(ctl_msg.scf_service_function, \
                                            (m)->len, (m)->data)) == 0)
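
/*
 * Note: PASS2XSCF() evaluates to nonzero on success and, as a side
 * effect, stores the return code of scf_service_putinfo() in r.
 */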

/*
 * The value of the following macro loosely depends on the
 * value of the "device busy" timeout used in the SCF driver.
 * (See pass2xscf_thread()).
 */
#define SCF_DEVBUSY_DELAY       10

/*
 * The default number of attempts to contact the scf driver
 * if we cannot fetch any information about the timeout value
 * it uses.
 */

#define REPEATS         4

typedef struct nm_msg {
        struct nm_msg *next;
        struct nm_msg *prev;
        int len;
        char data[1];
} nm_msg_t;

#define NM_LEN(len)             (sizeof (nm_msg_t) + (len) - 1)
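
/*
 * nm_msg_t uses the classic one-element-array trick for a variable
 * length payload; NM_LEN() subtracts the placeholder byte, so a
 * payload of len bytes is allocated as sizeof (nm_msg_t) + len - 1.
 */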

static struct ctlmsg {
        nm_msg_t        *head;
        nm_msg_t        *now_serving;
        kmutex_t        nm_lock;
        kthread_t       *nmt;
        int             cnt;
        int (*scf_service_function)(uint32_t, uint8_t,
                                    uint32_t, uint32_t, void *);
} ctl_msg;

static void
post_xscf_msg(char *dp, int len)
{
        nm_msg_t *msg;

        msg = (nm_msg_t *)kmem_zalloc(NM_LEN(len), KM_SLEEP);

        bcopy(dp, msg->data, len);
        msg->len = len;

        mutex_enter(&ctl_msg.nm_lock);
        if (ctl_msg.nmt == NULL) {
                ctl_msg.nmt = thread_create(NULL, 0, pass2xscf_thread,
                    NULL, 0, &p0, TS_RUN, minclsyspri);
        }

        PUSH(msg);
        ctl_msg.cnt++;
        mutex_exit(&ctl_msg.nm_lock);
}

static void
pass2xscf_thread()
{
        nm_msg_t *msg;
        int ret;
        uint_t i, msg_sent, xscf_driver_delay;
        static uint_t repeat_cnt;
        uint_t *scf_wait_cnt;

        mutex_enter(&ctl_msg.nm_lock);

        /*
         * Find the address of the SCF put routine if it's not done yet.
         */
        if (ctl_msg.scf_service_function == NULL) {
                if ((ctl_msg.scf_service_function =
                    (int (*)(uint32_t, uint8_t, uint32_t, uint32_t, void *))
                    modgetsymvalue("scf_service_putinfo", 0)) == NULL) {
                        cmn_err(CE_NOTE, "pass2xscf_thread: "
                            "scf_service_putinfo not found\n");
                        ctl_msg.nmt = NULL;
                        mutex_exit(&ctl_msg.nm_lock);
                        return;
                }
        }

        /*
         * Calculate the number of attempts to connect to the XSCF based
         * on the SCF driver delay (which is
         * SCF_DEVBUSY_DELAY * scf_online_wait_rcnt seconds) and the value
         * of xscf_connect_delay (the total number of seconds to wait
         * until the XSCF gets ready).
         */
        if (repeat_cnt == 0) {
                if ((scf_wait_cnt =
                    (uint_t *)
                    modgetsymvalue("scf_online_wait_rcnt", 0)) == NULL) {
                        repeat_cnt = REPEATS;
                } else {

                        xscf_driver_delay = *scf_wait_cnt *
                            SCF_DEVBUSY_DELAY;
                        repeat_cnt = (xscf_connect_delay/xscf_driver_delay) + 1;
                }
        }

        while (ctl_msg.cnt != 0) {

                /*
                 * Take the very last request from the queue,
                 */
                ctl_msg.now_serving = ctl_msg.head;
                ASSERT(ctl_msg.now_serving != NULL);

                /*
                 * and discard all the others if any.
                 */
                FREE_THE_TAIL(ctl_msg.now_serving);
                ctl_msg.cnt = 1;
                mutex_exit(&ctl_msg.nm_lock);

                /*
                 * Pass the name to the XSCF. Note that we do not hold
                 * the mutex while we are doing this.
                 */
                msg_sent = 0;
                for (i = 0; i < repeat_cnt; i++) {
                        if (PASS2XSCF(ctl_msg.now_serving, ret)) {
                                msg_sent = 1;
                                break;
                        } else {
                                if (ret != EBUSY) {
                                        cmn_err(CE_NOTE, "pass2xscf_thread:"
                                            " unexpected return code"
                                            " from scf_service_putinfo():"
                                            " %d\n", ret);
                                }
                        }
                }

                if (msg_sent) {

                        /*
                         * Remove the request from the list
                         */
                        mutex_enter(&ctl_msg.nm_lock);
                        msg = ctl_msg.now_serving;
                        ctl_msg.now_serving = NULL;
                        REMOVE(msg);
                        ctl_msg.cnt--;
                        mutex_exit(&ctl_msg.nm_lock);
                        FREE_MSG(msg);
                } else {

                        /*
                         * If any other requests arrived while we were
                         * trying to communicate with the XSCF, we will
                         * drop this one and take the latest one.
                         * Otherwise we will try to pass this one again.
                         */
                        cmn_err(CE_NOTE,
                            "pass2xscf_thread: "
                            "scf_service_putinfo "
                            "not responding\n");
                }
                mutex_enter(&ctl_msg.nm_lock);
        }

        /*
         * The request queue is empty, exit.
         */
        ctl_msg.nmt = NULL;
        mutex_exit(&ctl_msg.nm_lock);
}