2 Wdiff usr/src/uts/i86pc/os/machdep.c

Print this page

9059 Simplify SMAP relocations with krtld
Portions contributed by: John Levon <john.levon@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/i86pc/os/machdep.c
          +++ new/usr/src/uts/i86pc/os/machdep.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each

↓ open down ↓

13 lines elided

↑ open up ↑

  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  24      - * Copyright 2017, Joyent, Inc.
       24 + * Copyright 2020 Joyent, Inc.
  25   25   */
  26   26  /*
  27   27   * Copyright (c) 2010, Intel Corporation.
  28   28   * All rights reserved.
  29   29   */
  30   30  
  31   31  #include <sys/types.h>
  32   32  #include <sys/t_lock.h>
  33   33  #include <sys/param.h>
  34   34  #include <sys/segments.h>

  35   35  #include <sys/sysmacros.h>
  36   36  #include <sys/signal.h>
  37   37  #include <sys/systm.h>
  38   38  #include <sys/user.h>
  39   39  #include <sys/mman.h>
  40   40  #include <sys/vm.h>
  41   41  
  42   42  #include <sys/disp.h>
  43   43  #include <sys/class.h>
  44   44  
  45   45  #include <sys/proc.h>
  46   46  #include <sys/buf.h>
  47   47  #include <sys/kmem.h>
  48   48  
  49   49  #include <sys/reboot.h>
  50   50  #include <sys/uadmin.h>
  51   51  #include <sys/callb.h>
  52   52  
  53   53  #include <sys/cred.h>
  54   54  #include <sys/vnode.h>
  55   55  #include <sys/file.h>
  56   56  
  57   57  #include <sys/procfs.h>
  58   58  #include <sys/acct.h>
  59   59  
  60   60  #include <sys/vfs.h>
  61   61  #include <sys/dnlc.h>
  62   62  #include <sys/var.h>
  63   63  #include <sys/cmn_err.h>
  64   64  #include <sys/utsname.h>
  65   65  #include <sys/debug.h>
  66   66  
  67   67  #include <sys/dumphdr.h>
  68   68  #include <sys/bootconf.h>
  69   69  #include <sys/varargs.h>
  70   70  #include <sys/promif.h>
  71   71  #include <sys/modctl.h>
  72   72  
  73   73  #include <sys/consdev.h>
  74   74  #include <sys/frame.h>
  75   75  
  76   76  #include <sys/sunddi.h>
  77   77  #include <sys/ddidmareq.h>
  78   78  #include <sys/psw.h>
  79   79  #include <sys/regset.h>
  80   80  #include <sys/privregs.h>
  81   81  #include <sys/clock.h>
  82   82  #include <sys/tss.h>
  83   83  #include <sys/cpu.h>
  84   84  #include <sys/stack.h>
  85   85  #include <sys/trap.h>
  86   86  #include <sys/pic.h>
  87   87  #include <vm/hat.h>
  88   88  #include <vm/anon.h>
  89   89  #include <vm/as.h>
  90   90  #include <vm/page.h>
  91   91  #include <vm/seg.h>
  92   92  #include <vm/seg_kmem.h>
  93   93  #include <vm/seg_map.h>
  94   94  #include <vm/seg_vn.h>
  95   95  #include <vm/seg_kp.h>
  96   96  #include <vm/hat_i86.h>
  97   97  #include <sys/swap.h>
  98   98  #include <sys/thread.h>
  99   99  #include <sys/sysconf.h>
 100  100  #include <sys/vm_machparam.h>
 101  101  #include <sys/archsystm.h>
 102  102  #include <sys/machsystm.h>
 103  103  #include <sys/machlock.h>
 104  104  #include <sys/x_call.h>
 105  105  #include <sys/instance.h>
 106  106  
 107  107  #include <sys/time.h>
 108  108  #include <sys/smp_impldefs.h>
 109  109  #include <sys/psm_types.h>
 110  110  #include <sys/atomic.h>
 111  111  #include <sys/panic.h>
 112  112  #include <sys/cpuvar.h>
 113  113  #include <sys/dtrace.h>
 114  114  #include <sys/bl.h>
 115  115  #include <sys/nvpair.h>
 116  116  #include <sys/x86_archext.h>
 117  117  #include <sys/pool_pset.h>
 118  118  #include <sys/autoconf.h>
 119  119  #include <sys/mem.h>
 120  120  #include <sys/dumphdr.h>
 121  121  #include <sys/compress.h>
 122  122  #include <sys/cpu_module.h>
 123  123  #if defined(__xpv)
 124  124  #include <sys/hypervisor.h>
 125  125  #include <sys/xpv_panic.h>
 126  126  #endif
 127  127  
 128  128  #include <sys/fastboot.h>
 129  129  #include <sys/machelf.h>
 130  130  #include <sys/kobj.h>
 131  131  #include <sys/multiboot.h>
 132  132  
 133  133  #ifdef  TRAPTRACE
 134  134  #include <sys/traptrace.h>
 135  135  #endif  /* TRAPTRACE */
 136  136  
 137  137  #include <c2/audit.h>
 138  138  #include <sys/clock_impl.h>
 139  139  
 140  140  extern void audit_enterprom(int);
 141  141  extern void audit_exitprom(int);
 142  142  
 143  143  /*
 144  144   * Tunable to enable apix PSM; if set to 0, pcplusmp PSM will be used.
 145  145   */
 146  146  int     apix_enable = 1;
 147  147  
 148  148  int     apic_nvidia_io_max = 0; /* no. of NVIDIA i/o apics */
 149  149  
 150  150  /*
 151  151   * Occasionally the kernel knows better whether to power-off or reboot.
 152  152   */
 153  153  int force_shutdown_method = AD_UNKNOWN;
 154  154  
 155  155  /*
 156  156   * The panicbuf array is used to record messages and state:
 157  157   */
 158  158  char panicbuf[PANICBUFSIZE];
 159  159  
 160  160  /*
 161  161   * Flags to control Dynamic Reconfiguration features.
 162  162   */
 163  163  uint64_t plat_dr_options;
 164  164  
 165  165  /*
 166  166   * Maximum physical address for memory DR operations.
 167  167   */
 168  168  uint64_t plat_dr_physmax;
 169  169  
 170  170  /*
 171  171   * maxphys - used during physio
 172  172   * klustsize - used for klustering by swapfs and specfs
 173  173   */
 174  174  int maxphys = 56 * 1024;    /* XXX See vm_subr.c - max b_count in physio */
 175  175  int klustsize = 56 * 1024;
 176  176  
 177  177  caddr_t p0_va;          /* Virtual address for accessing physical page 0 */
 178  178  
 179  179  /*
 180  180   * defined here, though unused on x86,
 181  181   * to make kstat_fr.c happy.
 182  182   */

↓ open down ↓

148 lines elided

↑ open up ↑

 183  183  int vac;
 184  184  
 185  185  void debug_enter(char *);
 186  186  
 187  187  extern void pm_cfb_check_and_powerup(void);
 188  188  extern void pm_cfb_rele(void);
 189  189  
 190  190  extern fastboot_info_t newkernel;
 191  191  
 192  192  /*
      193 + * Instructions to enable or disable SMAP, respectively.
      194 + */
      195 +static const uint8_t clac_instr[3] = { 0x0f, 0x01, 0xca };
      196 +static const uint8_t stac_instr[3] = { 0x0f, 0x01, 0xcb };
      197 +
      198 +/*
 193  199   * Machine dependent code to reboot.
 194  200   * "mdep" is interpreted as a character pointer; if non-null, it is a pointer
 195  201   * to a string to be used as the argument string when rebooting.
 196  202   *
 197  203   * "invoke_cb" is a boolean. It is set to true when mdboot() can safely
 198  204   * invoke CB_CL_MDBOOT callbacks before shutting the system down, i.e. when
 199  205   * we are in a normal shutdown sequence (interrupts are not blocked, the
 200  206   * system is not panic'ing or being suspended).
 201  207   */
 202  208  /*ARGSUSED*/

 203  209  void
 204  210  mdboot(int cmd, int fcn, char *mdep, boolean_t invoke_cb)
 205  211  {
 206  212          processorid_t bootcpuid = 0;
 207  213          static int is_first_quiesce = 1;
 208  214          static int is_first_reset = 1;
 209  215          int reset_status = 0;
 210  216          static char fallback_str[] = "Falling back to regular reboot.\n";
 211  217  
 212  218          if (fcn == AD_FASTREBOOT && !newkernel.fi_valid)
 213  219                  fcn = AD_BOOT;
 214  220  
 215  221          if (!panicstr) {
 216  222                  kpreempt_disable();
 217  223                  if (fcn == AD_FASTREBOOT) {
 218  224                          mutex_enter(&cpu_lock);
 219  225                          if (CPU_ACTIVE(cpu_get(bootcpuid))) {
 220  226                                  affinity_set(bootcpuid);
 221  227                          }
 222  228                          mutex_exit(&cpu_lock);
 223  229                  } else {
 224  230                          affinity_set(CPU_CURRENT);
 225  231                  }
 226  232          }
 227  233  
 228  234          if (force_shutdown_method != AD_UNKNOWN)
 229  235                  fcn = force_shutdown_method;
 230  236  
 231  237          /*
 232  238           * XXX - rconsvp is set to NULL to ensure that output messages
 233  239           * are sent to the underlying "hardware" device using the
 234  240           * monitor's printf routine since we are in the process of
 235  241           * either rebooting or halting the machine.
 236  242           */
 237  243          rconsvp = NULL;
 238  244  
 239  245          /*
 240  246           * Print the reboot message now, before pausing other cpus.
 241  247           * There is a race condition in the printing support that
 242  248           * can deadlock multiprocessor machines.
 243  249           */
 244  250          if (!(fcn == AD_HALT || fcn == AD_POWEROFF))
 245  251                  prom_printf("rebooting...\n");
 246  252  
 247  253          if (IN_XPV_PANIC())
 248  254                  reset();
 249  255  
 250  256          /*
 251  257           * We can't bring up the console from above lock level, so do it now
 252  258           */
 253  259          pm_cfb_check_and_powerup();
 254  260  
 255  261          /* make sure there are no more changes to the device tree */
 256  262          devtree_freeze();
 257  263  
 258  264          if (invoke_cb)
 259  265                  (void) callb_execute_class(CB_CL_MDBOOT, 0);
 260  266  
 261  267          /*
 262  268           * Clear any unresolved UEs from memory.
 263  269           */
 264  270          page_retire_mdboot();
 265  271  
 266  272  #if defined(__xpv)
 267  273          /*
 268  274           * XXPV Should probably think some more about how we deal
 269  275           *      with panicking before it's really safe to panic.
 270  276           *      On hypervisors, we reboot very quickly..  Perhaps panic
 271  277           *      should only attempt to recover by rebooting if,
 272  278           *      say, we were able to mount the root filesystem,
 273  279           *      or if we successfully launched init(1m).
 274  280           */
 275  281          if (panicstr && proc_init == NULL)
 276  282                  (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 277  283  #endif
 278  284          /*
 279  285           * stop other cpus and raise our priority.  since there is only
 280  286           * one active cpu after this, and our priority will be too high
 281  287           * for us to be preempted, we're essentially single threaded
 282  288           * from here on out.
 283  289           */
 284  290          (void) spl6();
 285  291          if (!panicstr) {
 286  292                  mutex_enter(&cpu_lock);
 287  293                  pause_cpus(NULL, NULL);
 288  294                  mutex_exit(&cpu_lock);
 289  295          }
 290  296  
 291  297          /*
 292  298           * If the system is panicking, the preloaded kernel is valid, and
 293  299           * fastreboot_onpanic has been set, and the system has been up for
 294  300           * longer than fastreboot_onpanic_uptime (default to 10 minutes),
 295  301           * choose Fast Reboot.
 296  302           */
 297  303          if (fcn == AD_BOOT && panicstr && newkernel.fi_valid &&
 298  304              fastreboot_onpanic &&
 299  305              (panic_lbolt - lbolt_at_boot) > fastreboot_onpanic_uptime) {
 300  306                  fcn = AD_FASTREBOOT;
 301  307          }
 302  308  
 303  309          /*
 304  310           * Try to quiesce devices.
 305  311           */
 306  312          if (is_first_quiesce) {
 307  313                  /*
 308  314                   * Clear is_first_quiesce before calling quiesce_devices()
 309  315                   * so that if quiesce_devices() causes panics, it will not
 310  316                   * be invoked again.
 311  317                   */
 312  318                  is_first_quiesce = 0;
 313  319  
 314  320                  quiesce_active = 1;
 315  321                  quiesce_devices(ddi_root_node(), &reset_status);
 316  322                  if (reset_status == -1) {
 317  323                          if (fcn == AD_FASTREBOOT && !force_fastreboot) {
 318  324                                  prom_printf("Driver(s) not capable of fast "
 319  325                                      "reboot.\n");
 320  326                                  prom_printf(fallback_str);
 321  327                                  fastreboot_capable = 0;
 322  328                                  fcn = AD_BOOT;
 323  329                          } else if (fcn != AD_FASTREBOOT)
 324  330                                  fastreboot_capable = 0;
 325  331                  }
 326  332                  quiesce_active = 0;
 327  333          }
 328  334  
 329  335          /*
 330  336           * Try to reset devices. reset_leaves() should only be called
 331  337           * a) when there are no other threads that could be accessing devices,
 332  338           *    and
 333  339           * b) on a system that's not capable of fast reboot (fastreboot_capable
 334  340           *    being 0), or on a system where quiesce_devices() failed to
 335  341           *    complete (quiesce_active being 1).
 336  342           */
 337  343          if (is_first_reset && (!fastreboot_capable || quiesce_active)) {
 338  344                  /*
 339  345                   * Clear is_first_reset before calling reset_leaves()
 340  346                   * so that if reset_leaves() causes panics, it will not
 341  347                   * be invoked again.
 342  348                   */
 343  349                  is_first_reset = 0;
 344  350                  reset_leaves();
 345  351          }
 346  352  
 347  353          /* Verify newkernel checksum */
 348  354          if (fastreboot_capable && fcn == AD_FASTREBOOT &&
 349  355              fastboot_cksum_verify(&newkernel) != 0) {
 350  356                  fastreboot_capable = 0;
 351  357                  prom_printf("Fast reboot: checksum failed for the new "
 352  358                      "kernel.\n");
 353  359                  prom_printf(fallback_str);
 354  360          }
 355  361  
 356  362          (void) spl8();
 357  363  
 358  364          if (fastreboot_capable && fcn == AD_FASTREBOOT) {
 359  365                  /*
 360  366                   * psm_shutdown is called within fast_reboot()
 361  367                   */
 362  368                  fast_reboot();
 363  369          } else {
 364  370                  (*psm_shutdownf)(cmd, fcn);
 365  371  
 366  372                  if (fcn == AD_HALT || fcn == AD_POWEROFF)
 367  373                          halt((char *)NULL);
 368  374                  else
 369  375                          prom_reboot("");
 370  376          }
 371  377          /*NOTREACHED*/
 372  378  }
 373  379  
 374  380  /* mdpreboot - may be called prior to mdboot while root fs still mounted */
 375  381  /*ARGSUSED*/
 376  382  void
 377  383  mdpreboot(int cmd, int fcn, char *mdep)
 378  384  {
 379  385          if (fcn == AD_FASTREBOOT && !fastreboot_capable) {
 380  386                  fcn = AD_BOOT;
 381  387  #ifdef  __xpv
 382  388                  cmn_err(CE_WARN, "Fast reboot is not supported on xVM");
 383  389  #else
 384  390                  cmn_err(CE_WARN,
 385  391                      "Fast reboot is not supported on this platform%s",
 386  392                      fastreboot_nosup_message());
 387  393  #endif
 388  394          }
 389  395  
 390  396          if (fcn == AD_FASTREBOOT) {
 391  397                  fastboot_load_kernel(mdep);
 392  398                  if (!newkernel.fi_valid)
 393  399                          fcn = AD_BOOT;
 394  400          }
 395  401  
 396  402          (*psm_preshutdownf)(cmd, fcn);
 397  403  }
 398  404  
 399  405  static void
 400  406  stop_other_cpus(void)
 401  407  {
 402  408          ulong_t s = clear_int_flag(); /* fast way to keep CPU from changing */
 403  409          cpuset_t xcset;
 404  410  
 405  411          CPUSET_ALL_BUT(xcset, CPU->cpu_id);
 406  412          xc_priority(0, 0, 0, CPUSET2BV(xcset), mach_cpu_halt);
 407  413          restore_int_flag(s);
 408  414  }
 409  415  
 410  416  /*
 411  417   *      Machine dependent abort sequence handling
 412  418   */
 413  419  void
 414  420  abort_sequence_enter(char *msg)
 415  421  {
 416  422          if (abort_enable == 0) {
 417  423                  if (AU_ZONE_AUDITING(GET_KCTX_GZ))
 418  424                          audit_enterprom(0);
 419  425                  return;
 420  426          }
 421  427          if (AU_ZONE_AUDITING(GET_KCTX_GZ))
 422  428                  audit_enterprom(1);
 423  429          debug_enter(msg);
 424  430          if (AU_ZONE_AUDITING(GET_KCTX_GZ))
 425  431                  audit_exitprom(1);
 426  432  }
 427  433  
 428  434  /*
 429  435   * Enter debugger.  Called when the user types ctrl-alt-d or whenever
 430  436   * code wants to enter the debugger and possibly resume later.
 431  437   *
 432  438   * msg: message to print, possibly NULL.
 433  439   */
 434  440  void
 435  441  debug_enter(char *msg)
 436  442  {
 437  443          if (dtrace_debugger_init != NULL)
 438  444                  (*dtrace_debugger_init)();
 439  445  
 440  446          if (msg != NULL || (boothowto & RB_DEBUG))
 441  447                  prom_printf("\n");
 442  448  
 443  449          if (msg != NULL)
 444  450                  prom_printf("%s\n", msg);
 445  451  
 446  452          if (boothowto & RB_DEBUG)
 447  453                  kmdb_enter();
 448  454  
 449  455          if (dtrace_debugger_fini != NULL)
 450  456                  (*dtrace_debugger_fini)();
 451  457  }
 452  458  
 453  459  void
 454  460  reset(void)
 455  461  {
 456  462          extern  void acpi_reset_system();
 457  463  #if !defined(__xpv)
 458  464          ushort_t *bios_memchk;
 459  465  
 460  466          /*
 461  467           * Can't use psm_map_phys or acpi_reset_system before the hat is
 462  468           * initialized.
 463  469           */
 464  470          if (khat_running) {
 465  471                  bios_memchk = (ushort_t *)psm_map_phys(0x472,
 466  472                      sizeof (ushort_t), PROT_READ | PROT_WRITE);
 467  473                  if (bios_memchk)
 468  474                          *bios_memchk = 0x1234;  /* bios memory check disable */
 469  475  
 470  476                  if (options_dip != NULL &&
 471  477                      ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), 0,
 472  478                      "efi-systab")) {
 473  479                          if (bootops == NULL)
 474  480                                  acpi_reset_system();
 475  481                          efi_reset();
 476  482                  }
 477  483  
 478  484                  /*
 479  485                   * The problem with using stubs is that we can call
 480  486                   * acpi_reset_system only after the kernel is up and running.
 481  487                   *
 482  488                   * We should create a global state to keep track of how far
 483  489                   * up the kernel is but for the time being we will depend on
 484  490                   * bootops. bootops cleared in startup_end().
 485  491                   */
 486  492                  if (bootops == NULL)
 487  493                          acpi_reset_system();
 488  494          }
 489  495  
 490  496          pc_reset();
 491  497  #else
 492  498          if (IN_XPV_PANIC()) {
 493  499                  if (khat_running && bootops == NULL) {
 494  500                          acpi_reset_system();
 495  501                  }
 496  502  
 497  503                  pc_reset();
 498  504          }
 499  505  
 500  506          (void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
 501  507          panic("HYPERVISOR_shutdown() failed");
 502  508  #endif
 503  509          /*NOTREACHED*/
 504  510  }
 505  511  
 506  512  /*
 507  513   * Halt the machine and return to the monitor
 508  514   */
 509  515  void
 510  516  halt(char *s)
 511  517  {
 512  518          stop_other_cpus();      /* send stop signal to other CPUs */
 513  519          if (s)
 514  520                  prom_printf("(%s) \n", s);
 515  521          prom_exit_to_mon();
 516  522          /*NOTREACHED*/
 517  523  }
 518  524  
 519  525  /*
 520  526   * Initiate interrupt redistribution.
 521  527   */
 522  528  void
 523  529  i_ddi_intr_redist_all_cpus()
 524  530  {
 525  531  }
 526  532  
 527  533  /*
 528  534   * XXX These probably ought to live somewhere else
 529  535   * XXX They are called from mem.c
 530  536   */
 531  537  
 532  538  /*
 533  539   * Convert page frame number to an OBMEM page frame number
 534  540   * (i.e. put in the type bits -- zero for this implementation)
 535  541   */
 536  542  pfn_t
 537  543  impl_obmem_pfnum(pfn_t pf)
 538  544  {
 539  545          return (pf);
 540  546  }
 541  547  
 542  548  #ifdef  NM_DEBUG
 543  549  int nmi_test = 0;       /* checked in intentry.s during clock int */
 544  550  int nmtest = -1;
 545  551  nmfunc1(int arg, struct regs *rp)
 546  552  {
 547  553          printf("nmi called with arg = %x, regs = %x\n", arg, rp);
 548  554          nmtest += 50;
 549  555          if (arg == nmtest) {
 550  556                  printf("ip = %x\n", rp->r_pc);
 551  557                  return (1);
 552  558          }
 553  559          return (0);
 554  560  }
 555  561  
 556  562  #endif
 557  563  
 558  564  #include <sys/bootsvcs.h>
 559  565  
 560  566  /* Hacked up initialization for initial kernel check out is HERE. */
 561  567  /* The basic steps are: */
 562  568  /*      kernel bootfuncs definition/initialization for KADB */
 563  569  /*      kadb bootfuncs pointer initialization */
 564  570  /*      putchar/getchar (interrupts disabled) */
 565  571  
 566  572  /* kadb bootfuncs pointer initialization */
 567  573  
 568  574  int
 569  575  sysp_getchar()
 570  576  {
 571  577          int i;
 572  578          ulong_t s;
 573  579  
 574  580          if (cons_polledio == NULL) {
 575  581                  /* Uh oh */
 576  582                  prom_printf("getchar called with no console\n");
 577  583                  for (;;)
 578  584                          /* LOOP FOREVER */;
 579  585          }
 580  586  
 581  587          s = clear_int_flag();
 582  588          i = cons_polledio->cons_polledio_getchar(
 583  589              cons_polledio->cons_polledio_argument);
 584  590          restore_int_flag(s);
 585  591          return (i);
 586  592  }
 587  593  
 588  594  void
 589  595  sysp_putchar(int c)
 590  596  {
 591  597          ulong_t s;
 592  598  
 593  599          /*
 594  600           * We have no alternative but to drop the output on the floor.
 595  601           */
 596  602          if (cons_polledio == NULL ||
 597  603              cons_polledio->cons_polledio_putchar == NULL)
 598  604                  return;
 599  605  
 600  606          s = clear_int_flag();
 601  607          cons_polledio->cons_polledio_putchar(
 602  608              cons_polledio->cons_polledio_argument, c);
 603  609          restore_int_flag(s);
 604  610  }
 605  611  
 606  612  int
 607  613  sysp_ischar()
 608  614  {
 609  615          int i;
 610  616          ulong_t s;
 611  617  
 612  618          if (cons_polledio == NULL ||
 613  619              cons_polledio->cons_polledio_ischar == NULL)
 614  620                  return (0);
 615  621  
 616  622          s = clear_int_flag();
 617  623          i = cons_polledio->cons_polledio_ischar(
 618  624              cons_polledio->cons_polledio_argument);
 619  625          restore_int_flag(s);
 620  626          return (i);
 621  627  }
 622  628  
 623  629  int
 624  630  goany(void)
 625  631  {
 626  632          prom_printf("Type any key to continue ");
 627  633          (void) prom_getchar();
 628  634          prom_printf("\n");
 629  635          return (1);
 630  636  }
 631  637  
 632  638  static struct boot_syscalls kern_sysp = {
 633  639          sysp_getchar,   /*      unchar  (*getchar)();   7  */
 634  640          sysp_putchar,   /*      int     (*putchar)();   8  */
 635  641          sysp_ischar,    /*      int     (*ischar)();    9  */
 636  642  };
 637  643  
 638  644  #if defined(__xpv)
 639  645  int using_kern_polledio;
 640  646  #endif
 641  647  
 642  648  void
 643  649  kadb_uses_kernel()
 644  650  {
 645  651          /*
 646  652           * This routine is now totally misnamed, since it does not in fact
 647  653           * control kadb's I/O; it only controls the kernel's prom_* I/O.
 648  654           */
 649  655          sysp = &kern_sysp;
 650  656  #if defined(__xpv)
 651  657          using_kern_polledio = 1;
 652  658  #endif
 653  659  }
 654  660  
 655  661  /*
 656  662   *      the interface to the outside world
 657  663   */
 658  664  
 659  665  /*
 660  666   * poll_port -- wait for a register to achieve a
 661  667   *              specific state.  Arguments are a mask of bits we care about,
 662  668   *              and two sub-masks.  To return normally, all the bits in the
 663  669   *              first sub-mask must be ON, all the bits in the second sub-
 664  670   *              mask must be OFF.  If about 5 seconds pass without the register
 665  671   *              achieving the desired bit configuration, we return 1, else
 666  672   *              0.
 667  673   */
 668  674  int
 669  675  poll_port(ushort_t port, ushort_t mask, ushort_t onbits, ushort_t offbits)
 670  676  {
 671  677          int i;
 672  678          ushort_t maskval;
 673  679  
 674  680          for (i = 500000; i; i--) {
 675  681                  maskval = inb(port) & mask;
 676  682                  if (((maskval & onbits) == onbits) &&
 677  683                      ((maskval & offbits) == 0))
 678  684                          return (0);
 679  685                  drv_usecwait(10);
 680  686          }
 681  687          return (1);
 682  688  }
 683  689  
 684  690  /*
 685  691   * set_idle_cpu is called from idle() when a CPU becomes idle.
 686  692   */
 687  693  /*LINTED: static unused */
 688  694  static uint_t last_idle_cpu;
 689  695  
 690  696  /*ARGSUSED*/
 691  697  void
 692  698  set_idle_cpu(int cpun)
 693  699  {
 694  700          last_idle_cpu = cpun;
 695  701          (*psm_set_idle_cpuf)(cpun);
 696  702  }
 697  703  
 698  704  /*
 699  705   * unset_idle_cpu is called from idle() when a CPU is no longer idle.
 700  706   */
 701  707  /*ARGSUSED*/
 702  708  void
 703  709  unset_idle_cpu(int cpun)
 704  710  {
 705  711          (*psm_unset_idle_cpuf)(cpun);
 706  712  }
 707  713  
 708  714  /*
 709  715   * This routine is almost correct now, but not quite.  It still needs the
 710  716   * equivalent concept of "hres_last_tick", just like on the sparc side.
 711  717   * The idea is to take a snapshot of the hi-res timer while doing the
 712  718   * hrestime_adj updates under hres_lock in locore, so that the small
 713  719   * interval between interrupt assertion and interrupt processing is
 714  720   * accounted for correctly.  Once we have this, the code below should
 715  721   * be modified to subtract off hres_last_tick rather than hrtime_base.
 716  722   *
 717  723   * I'd have done this myself, but I don't have source to all of the
 718  724   * vendor-specific hi-res timer routines (grrr...).  The generic hook I
 719  725   * need is something like "gethrtime_unlocked()", which would be just like
 720  726   * gethrtime() but would assume that you're already holding CLOCK_LOCK().
 721  727   * This is what the GET_HRTIME() macro is for on sparc (although it also
 722  728   * serves the function of making time available without a function call
 723  729   * so you don't take a register window overflow while traps are disabled).
 724  730   */
 725  731  void
 726  732  pc_gethrestime(timestruc_t *tp)
 727  733  {
 728  734          int lock_prev;
 729  735          timestruc_t now;
 730  736          int nslt;               /* nsec since last tick */
 731  737          int adj;                /* amount of adjustment to apply */
 732  738  
 733  739  loop:
 734  740          lock_prev = hres_lock;
 735  741          now = hrestime;
 736  742          nslt = (int)(gethrtime() - hres_last_tick);
 737  743          if (nslt < 0) {
 738  744                  /*
 739  745                   * nslt < 0 means a tick came between sampling
 740  746                   * gethrtime() and hres_last_tick; restart the loop
 741  747                   */
 742  748  
 743  749                  goto loop;
 744  750          }
 745  751          now.tv_nsec += nslt;
 746  752          if (hrestime_adj != 0) {
 747  753                  if (hrestime_adj > 0) {
 748  754                          adj = (nslt >> ADJ_SHIFT);
 749  755                          if (adj > hrestime_adj)
 750  756                                  adj = (int)hrestime_adj;
 751  757                  } else {
 752  758                          adj = -(nslt >> ADJ_SHIFT);
 753  759                          if (adj < hrestime_adj)
 754  760                                  adj = (int)hrestime_adj;
 755  761                  }
 756  762                  now.tv_nsec += adj;
 757  763          }
 758  764          while ((unsigned long)now.tv_nsec >= NANOSEC) {
 759  765  
 760  766                  /*
 761  767                   * We might have a large adjustment or have been in the
 762  768                   * debugger for a long time; take care of (at most) four
 763  769                   * of those missed seconds (tv_nsec is 32 bits, so
 764  770                   * anything >4s will be wrapping around).  However,
 765  771                   * anything more than 2 seconds out of sync will trigger
 766  772                   * timedelta from clock() to go correct the time anyway,
 767  773                   * so do what we can, and let the big crowbar do the
 768  774                   * rest.  A similar correction while loop exists inside
 769  775                   * hres_tick(); in all cases we'd like tv_nsec to
 770  776                   * satisfy 0 <= tv_nsec < NANOSEC to avoid confusing
 771  777                   * user processes, but if tv_sec's a little behind for a
 772  778                   * little while, that's OK; time still monotonically
 773  779                   * increases.
 774  780                   */
 775  781  
 776  782                  now.tv_nsec -= NANOSEC;
 777  783                  now.tv_sec++;
 778  784          }
 779  785          if ((hres_lock & ~1) != lock_prev)
 780  786                  goto loop;
 781  787  
 782  788          *tp = now;
 783  789  }
 784  790  
 785  791  void
 786  792  gethrestime_lasttick(timespec_t *tp)
 787  793  {
 788  794          int s;
 789  795  
 790  796          s = hr_clock_lock();
 791  797          *tp = hrestime;
 792  798          hr_clock_unlock(s);
 793  799  }
 794  800  
 795  801  time_t
 796  802  gethrestime_sec(void)
 797  803  {
 798  804          timestruc_t now;
 799  805  
 800  806          gethrestime(&now);
 801  807          return (now.tv_sec);
 802  808  }
 803  809  
 804  810  /*
 805  811   * Initialize a kernel thread's stack
 806  812   */
 807  813  
 808  814  caddr_t
 809  815  thread_stk_init(caddr_t stk)
 810  816  {
 811  817          ASSERT(((uintptr_t)stk & (STACK_ALIGN - 1)) == 0);
 812  818          return (stk - SA(MINFRAME));
 813  819  }
 814  820  
 815  821  /*
 816  822   * Initialize lwp's kernel stack.
 817  823   */
 818  824  
 819  825  #ifdef TRAPTRACE
 820  826  /*
 821  827   * There's a tricky interdependency here between use of sysenter and
 822  828   * TRAPTRACE which needs recording to avoid future confusion (this is
 823  829   * about the third time I've re-figured this out ..)
 824  830   *
 825  831   * Here's how debugging lcall works with TRAPTRACE.
 826  832   *
 827  833   * 1 We're in userland with a breakpoint on the lcall instruction.
 828  834   * 2 We execute the instruction - the instruction pushes the userland
 829  835   *   %ss, %esp, %efl, %cs, %eip on the stack and zips into the kernel
 830  836   *   via the call gate.
 831  837   * 3 The hardware raises a debug trap in kernel mode, the hardware
 832  838   *   pushes %efl, %cs, %eip and gets to dbgtrap via the idt.
 833  839   * 4 dbgtrap pushes the error code and trapno and calls cmntrap
 834  840   * 5 cmntrap finishes building a trap frame
 835  841   * 6 The TRACE_REGS macros in cmntrap copy a REGSIZE worth chunk
 836  842   *   off the stack into the traptrace buffer.
 837  843   *
 838  844   * This means that the traptrace buffer contains the wrong values in
 839  845   * %esp and %ss, but everything else in there is correct.
 840  846   *
 841  847   * Here's how debugging sysenter works with TRAPTRACE.
 842  848   *
 843  849   * a We're in userland with a breakpoint on the sysenter instruction.
 844  850   * b We execute the instruction - the instruction pushes -nothing-
 845  851   *   on the stack, but sets %cs, %eip, %ss, %esp to prearranged
 846  852   *   values to take us to sys_sysenter, at the top of the lwp's
 847  853   *   stack.
 848  854   * c goto 3
 849  855   *
 850  856   * At this point, because we got into the kernel without the requisite
 851  857   * five pushes on the stack, if we didn't make extra room, we'd
 852  858   * end up with the TRACE_REGS macro fetching the saved %ss and %esp
 853  859   * values from negative (unmapped) stack addresses -- which really bites.
 854  860   * That's why we do the '-= 2 * sizeof (greg_t)' below.
 855  861   *
 856  862   * XXX  Note that reading "up" lwp0's stack works because t0 is declared
 857  863   *      right next to t0stack in locore.s
 858  864   */
 859  865  #endif
 860  866  
 861  867  caddr_t
 862  868  lwp_stk_init(klwp_t *lwp, caddr_t stk)
 863  869  {
 864  870          caddr_t oldstk;
 865  871          struct pcb *pcb = &lwp->lwp_pcb;
 866  872  
 867  873          oldstk = stk;
 868  874          stk -= SA(sizeof (struct regs) + SA(MINFRAME));
 869  875  #ifdef TRAPTRACE
 870  876          stk -= 2 * sizeof (greg_t); /* space for phony %ss:%sp (see above) */
 871  877  #endif
 872  878          stk = (caddr_t)((uintptr_t)stk & ~(STACK_ALIGN - 1ul));
 873  879          bzero(stk, oldstk - stk);
 874  880          lwp->lwp_regs = (void *)(stk + SA(MINFRAME));
 875  881  
 876  882          /*
 877  883           * Arrange that the virtualized %fs and %gs GDT descriptors
 878  884           * have a well-defined initial state (present, ring 3
 879  885           * and of type data).
 880  886           */
 881  887  #if defined(__amd64)
 882  888          if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE)
 883  889                  pcb->pcb_fsdesc = pcb->pcb_gsdesc = zero_udesc;
 884  890          else
 885  891                  pcb->pcb_fsdesc = pcb->pcb_gsdesc = zero_u32desc;
 886  892  #elif defined(__i386)
 887  893          pcb->pcb_fsdesc = pcb->pcb_gsdesc = zero_udesc;
 888  894  #endif  /* __i386 */
 889  895          lwp_installctx(lwp);
 890  896          return (stk);
 891  897  }
 892  898  
 893  899  /*
 894  900   * Use this opportunity to free any dynamically allocated fp storage.
 895  901   */
 896  902  void
 897  903  lwp_stk_fini(klwp_t *lwp)
 898  904  {
 899  905          fp_lwp_cleanup(lwp);
 900  906  }
 901  907  
 902  908  void
 903  909  lwp_fp_init(klwp_t *lwp)
 904  910  {
 905  911          fp_lwp_init(lwp);
 906  912  }
 907  913  
 908  914  /*
 909  915   * If we're not the panic CPU, we wait in panic_idle for reboot.
 910  916   */
 911  917  void
 912  918  panic_idle(void)
 913  919  {
 914  920          splx(ipltospl(CLOCK_LEVEL));
 915  921          (void) setjmp(&curthread->t_pcb);
 916  922  
 917  923          dumpsys_helper();
 918  924  
 919  925  #ifndef __xpv
 920  926          for (;;)
 921  927                  i86_halt();
 922  928  #else
 923  929          for (;;)
 924  930                  ;
 925  931  #endif
 926  932  }
 927  933  
 928  934  /*
 929  935   * Stop the other CPUs by cross-calling them and forcing them to enter
 930  936   * the panic_idle() loop above.
 931  937   */
 932  938  /*ARGSUSED*/
 933  939  void
 934  940  panic_stopcpus(cpu_t *cp, kthread_t *t, int spl)
 935  941  {
 936  942          processorid_t i;
 937  943          cpuset_t xcset;
 938  944  
 939  945          /*
 940  946           * In the case of a Xen panic, the hypervisor has already stopped
 941  947           * all of the CPUs.
 942  948           */
 943  949          if (!IN_XPV_PANIC()) {
 944  950                  (void) splzs();
 945  951  
 946  952                  CPUSET_ALL_BUT(xcset, cp->cpu_id);
 947  953                  xc_priority(0, 0, 0, CPUSET2BV(xcset), (xc_func_t)panic_idle);
 948  954          }
 949  955  
 950  956          for (i = 0; i < NCPU; i++) {
 951  957                  if (i != cp->cpu_id && cpu[i] != NULL &&
 952  958                      (cpu[i]->cpu_flags & CPU_EXISTS))
 953  959                          cpu[i]->cpu_flags |= CPU_QUIESCED;
 954  960          }
 955  961  }
 956  962  
 957  963  /*
 958  964   * Platform callback following each entry to panicsys().
 959  965   */
 960  966  /*ARGSUSED*/
 961  967  void
 962  968  panic_enter_hw(int spl)
 963  969  {
 964  970          /* Nothing to do here */
 965  971  }
 966  972  
 967  973  /*
 968  974   * Platform-specific code to execute after panicstr is set: we invoke
 969  975   * the PSM entry point to indicate that a panic has occurred.
 970  976   */
 971  977  /*ARGSUSED*/
 972  978  void
 973  979  panic_quiesce_hw(panic_data_t *pdp)
 974  980  {
 975  981          psm_notifyf(PSM_PANIC_ENTER);
 976  982  
 977  983          cmi_panic_callback();
 978  984  
 979  985  #ifdef  TRAPTRACE
 980  986          /*
 981  987           * Turn off TRAPTRACE
 982  988           */
 983  989          TRAPTRACE_FREEZE;
 984  990  #endif  /* TRAPTRACE */
 985  991  }
 986  992  
 987  993  /*
 988  994   * Platform callback prior to writing crash dump.
 989  995   */
 990  996  /*ARGSUSED*/
 991  997  void
 992  998  panic_dump_hw(int spl)
 993  999  {
 994 1000          /* Nothing to do here */
 995 1001  }
 996 1002  
 997 1003  void *
 998 1004  plat_traceback(void *fpreg)
 999 1005  {
1000 1006  #ifdef __xpv
1001 1007          if (IN_XPV_PANIC())
1002 1008                  return (xpv_traceback(fpreg));
1003 1009  #endif
1004 1010          return (fpreg);
1005 1011  }
1006 1012  
1007 1013  /*ARGSUSED*/
1008 1014  void
1009 1015  plat_tod_fault(enum tod_fault_type tod_bad)
1010 1016  {}
1011 1017  
1012 1018  /*ARGSUSED*/
1013 1019  int
1014 1020  blacklist(int cmd, const char *scheme, nvlist_t *fmri, const char *class)
1015 1021  {
1016 1022          return (ENOTSUP);
1017 1023  }
1018 1024  
1019 1025  /*
1020 1026   * The underlying console output routines are protected by raising IPL in case
1021 1027   * we are still calling into the early boot services.  Once we start calling
1022 1028   * the kernel console emulator, it will disable interrupts completely during
1023 1029   * character rendering (see sysp_putchar, for example).  Refer to the comments
1024 1030   * and code in common/os/console.c for more information on these callbacks.
1025 1031   */
1026 1032  /*ARGSUSED*/
1027 1033  int
1028 1034  console_enter(int busy)
1029 1035  {
1030 1036          return (splzs());
1031 1037  }
1032 1038  
1033 1039  /*ARGSUSED*/
1034 1040  void
1035 1041  console_exit(int busy, int spl)
1036 1042  {
1037 1043          splx(spl);
1038 1044  }
1039 1045  
1040 1046  /*
1041 1047   * Allocate a region of virtual address space, unmapped.
1042 1048   * Stubbed out except on sparc, at least for now.
1043 1049   */
1044 1050  /*ARGSUSED*/
1045 1051  void *
1046 1052  boot_virt_alloc(void *addr, size_t size)
1047 1053  {
1048 1054          return (addr);
1049 1055  }
1050 1056  
1051 1057  volatile unsigned long  tenmicrodata;
1052 1058  
1053 1059  void
1054 1060  tenmicrosec(void)
1055 1061  {
1056 1062          extern int gethrtime_hires;
1057 1063  
1058 1064          if (gethrtime_hires) {
1059 1065                  hrtime_t start, end;
1060 1066                  start = end =  gethrtime();
1061 1067                  while ((end - start) < (10 * (NANOSEC / MICROSEC))) {
1062 1068                          SMT_PAUSE();
1063 1069                          end = gethrtime();
1064 1070                  }
1065 1071          } else {
1066 1072  #if defined(__xpv)
1067 1073                  hrtime_t newtime;
1068 1074  
1069 1075                  newtime = xpv_gethrtime() + 10000; /* now + 10 us */
1070 1076                  while (xpv_gethrtime() < newtime)
1071 1077                          SMT_PAUSE();
1072 1078  #else   /* __xpv */
1073 1079                  int i;
1074 1080  
1075 1081                  /*
1076 1082                   * Artificial loop to induce delay.
1077 1083                   */
1078 1084                  for (i = 0; i < microdata; i++)
1079 1085                          tenmicrodata = microdata;
1080 1086  #endif  /* __xpv */
1081 1087          }
1082 1088  }
1083 1089  
1084 1090  /*
1085 1091   * get_cpu_mstate() is passed an array of timestamps, NCMSTATES
1086 1092   * long, and it fills in the array with the time spent on cpu in
1087 1093   * each of the mstates, where time is returned in nsec.
1088 1094   *
1089 1095   * No guarantee is made that the returned values in times[] will
1090 1096   * monotonically increase on sequential calls, although this will
1091 1097   * be true in the long run. Any such guarantee must be handled by
1092 1098   * the caller, if needed. This can happen if we fail to account
1093 1099   * for elapsed time due to a generation counter conflict, yet we
1094 1100   * did account for it on a prior call (see below).
1095 1101   *
1096 1102   * The complication is that the cpu in question may be updating
1097 1103   * its microstate at the same time that we are reading it.
1098 1104   * Because the microstate is only updated when the CPU's state
1099 1105   * changes, the values in cpu_intracct[] can be indefinitely out
1100 1106   * of date. To determine true current values, it is necessary to
1101 1107   * compare the current time with cpu_mstate_start, and add the
1102 1108   * difference to times[cpu_mstate].
1103 1109   *
1104 1110   * This can be a problem if those values are changing out from
1105 1111   * under us. Because the code path in new_cpu_mstate() is
1106 1112   * performance critical, we have not added a lock to it. Instead,
1107 1113   * we have added a generation counter. Before beginning
1108 1114   * modifications, the counter is set to 0. After modifications,
1109 1115   * it is set to the old value plus one.
1110 1116   *
1111 1117   * get_cpu_mstate() will not consider the values of cpu_mstate
1112 1118   * and cpu_mstate_start to be usable unless the value of
1113 1119   * cpu_mstate_gen is both non-zero and unchanged, both before and
1114 1120   * after reading the mstate information. Note that we must
1115 1121   * protect against out-of-order loads around accesses to the
1116 1122   * generation counter. Also, this is a best effort approach in
1117 1123   * that we do not retry should the counter be found to have
1118 1124   * changed.
1119 1125   *
1120 1126   * cpu_intracct[] is used to identify time spent in each CPU
1121 1127   * mstate while handling interrupts. Such time should be reported
1122 1128   * against system time, and so is subtracted out from its
1123 1129   * corresponding cpu_acct[] time and added to
1124 1130   * cpu_acct[CMS_SYSTEM].
1125 1131   */
1126 1132  
1127 1133  void
1128 1134  get_cpu_mstate(cpu_t *cpu, hrtime_t *times)
1129 1135  {
1130 1136          int i;
1131 1137          hrtime_t now, start;
1132 1138          uint16_t gen;
1133 1139          uint16_t state;
1134 1140          hrtime_t intracct[NCMSTATES];
1135 1141  
1136 1142          /*
1137 1143           * Load all volatile state under the protection of membar.
1138 1144           * cpu_acct[cpu_mstate] must be loaded to avoid double counting
1139 1145           * of (now - cpu_mstate_start) by a change in CPU mstate that
1140 1146           * arrives after we make our last check of cpu_mstate_gen.
1141 1147           */
1142 1148  
1143 1149          now = gethrtime_unscaled();
1144 1150          gen = cpu->cpu_mstate_gen;
1145 1151  
1146 1152          membar_consumer();      /* guarantee load ordering */
1147 1153          start = cpu->cpu_mstate_start;
1148 1154          state = cpu->cpu_mstate;
1149 1155          for (i = 0; i < NCMSTATES; i++) {
1150 1156                  intracct[i] = cpu->cpu_intracct[i];
1151 1157                  times[i] = cpu->cpu_acct[i];
1152 1158          }
1153 1159          membar_consumer();      /* guarantee load ordering */
1154 1160  
1155 1161          if (gen != 0 && gen == cpu->cpu_mstate_gen && now > start)
1156 1162                  times[state] += now - start;
1157 1163  
1158 1164          for (i = 0; i < NCMSTATES; i++) {
1159 1165                  if (i == CMS_SYSTEM)
1160 1166                          continue;
1161 1167                  times[i] -= intracct[i];
1162 1168                  if (times[i] < 0) {
1163 1169                          intracct[i] += times[i];
1164 1170                          times[i] = 0;
1165 1171                  }
1166 1172                  times[CMS_SYSTEM] += intracct[i];
1167 1173                  scalehrtime(&times[i]);
1168 1174          }
1169 1175          scalehrtime(&times[CMS_SYSTEM]);
1170 1176  }
1171 1177  
1172 1178  /*
1173 1179   * This is a version of the rdmsr instruction that allows
1174 1180   * an error code to be returned in the case of failure.
1175 1181   */
1176 1182  int
1177 1183  checked_rdmsr(uint_t msr, uint64_t *value)
1178 1184  {
1179 1185          if (!is_x86_feature(x86_featureset, X86FSET_MSR))
1180 1186                  return (ENOTSUP);
1181 1187          *value = rdmsr(msr);
1182 1188          return (0);
1183 1189  }
1184 1190  
1185 1191  /*
1186 1192   * This is a version of the wrmsr instruction that allows
1187 1193   * an error code to be returned in the case of failure.
1188 1194   */
1189 1195  int
1190 1196  checked_wrmsr(uint_t msr, uint64_t value)
1191 1197  {
1192 1198          if (!is_x86_feature(x86_featureset, X86FSET_MSR))
1193 1199                  return (ENOTSUP);
1194 1200          wrmsr(msr, value);
1195 1201          return (0);
1196 1202  }
1197 1203  
1198 1204  /*
1199 1205   * The mem driver's usual method of using hat_devload() to establish a
1200 1206   * temporary mapping will not work for foreign pages mapped into this
1201 1207   * domain or for the special hypervisor-provided pages.  For the foreign
1202 1208   * pages, we often don't know which domain owns them, so we can't ask the
1203 1209   * hypervisor to set up a new mapping.  For the other pages, we don't have
1204 1210   * a pfn, so we can't create a new PTE.  For these special cases, we do a
1205 1211   * direct uiomove() from the existing kernel virtual address.
1206 1212   */
1207 1213  /*ARGSUSED*/
1208 1214  int
1209 1215  plat_mem_do_mmio(struct uio *uio, enum uio_rw rw)
1210 1216  {
1211 1217  #if defined(__xpv)
1212 1218          void *va = (void *)(uintptr_t)uio->uio_loffset;
1213 1219          off_t pageoff = uio->uio_loffset & PAGEOFFSET;
1214 1220          size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
1215 1221              (size_t)uio->uio_iov->iov_len);
1216 1222  
1217 1223          if ((rw == UIO_READ &&
1218 1224              (va == HYPERVISOR_shared_info || va == xen_info)) ||
1219 1225              (pfn_is_foreign(hat_getpfnum(kas.a_hat, va))))
1220 1226                  return (uiomove(va, nbytes, rw, uio));
1221 1227  #endif
1222 1228          return (ENOTSUP);
1223 1229  }
1224 1230  
1225 1231  pgcnt_t
1226 1232  num_phys_pages()
1227 1233  {
1228 1234          pgcnt_t npages = 0;
1229 1235          struct memlist *mp;
1230 1236  
1231 1237  #if defined(__xpv)
1232 1238          if (DOMAIN_IS_INITDOMAIN(xen_info))
1233 1239                  return (xpv_nr_phys_pages());
1234 1240  #endif /* __xpv */
1235 1241  
1236 1242          for (mp = phys_install; mp != NULL; mp = mp->ml_next)
1237 1243                  npages += mp->ml_size >> PAGESHIFT;
1238 1244  
1239 1245          return (npages);
1240 1246  }
1241 1247  
1242 1248  /* cpu threshold for compressed dumps */
1243 1249  #ifdef _LP64
1244 1250  uint_t dump_plat_mincpu_default = DUMP_PLAT_X86_64_MINCPU;
1245 1251  #else
1246 1252  uint_t dump_plat_mincpu_default = DUMP_PLAT_X86_32_MINCPU;
1247 1253  #endif
1248 1254  
1249 1255  int
1250 1256  dump_plat_addr()
1251 1257  {
1252 1258  #ifdef __xpv
1253 1259          pfn_t pfn = mmu_btop(xen_info->shared_info) | PFN_IS_FOREIGN_MFN;
1254 1260          mem_vtop_t mem_vtop;
1255 1261          int cnt;
1256 1262  
1257 1263          /*
1258 1264           * On the hypervisor, we want to dump the page with shared_info on it.
1259 1265           */
1260 1266          if (!IN_XPV_PANIC()) {
1261 1267                  mem_vtop.m_as = &kas;
1262 1268                  mem_vtop.m_va = HYPERVISOR_shared_info;
1263 1269                  mem_vtop.m_pfn = pfn;
1264 1270                  dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
1265 1271                  cnt = 1;
1266 1272          } else {
1267 1273                  cnt = dump_xpv_addr();
1268 1274          }
1269 1275          return (cnt);
1270 1276  #else
1271 1277          return (0);
1272 1278  #endif
1273 1279  }
1274 1280  
1275 1281  void
1276 1282  dump_plat_pfn()
1277 1283  {
1278 1284  #ifdef __xpv
1279 1285          pfn_t pfn = mmu_btop(xen_info->shared_info) | PFN_IS_FOREIGN_MFN;
1280 1286  
1281 1287          if (!IN_XPV_PANIC())
1282 1288                  dumpvp_write(&pfn, sizeof (pfn));
1283 1289          else
1284 1290                  dump_xpv_pfn();
1285 1291  #endif
1286 1292  }
1287 1293  
1288 1294  /*ARGSUSED*/
1289 1295  int
1290 1296  dump_plat_data(void *dump_cbuf)
1291 1297  {
1292 1298  #ifdef __xpv
1293 1299          uint32_t csize;
1294 1300          int cnt;
1295 1301  
1296 1302          if (!IN_XPV_PANIC()) {
1297 1303                  csize = (uint32_t)compress(HYPERVISOR_shared_info, dump_cbuf,
1298 1304                      PAGESIZE);
1299 1305                  dumpvp_write(&csize, sizeof (uint32_t));
1300 1306                  dumpvp_write(dump_cbuf, csize);
1301 1307                  cnt = 1;
1302 1308          } else {
1303 1309                  cnt = dump_xpv_data(dump_cbuf);
1304 1310          }
1305 1311          return (cnt);
1306 1312  #else
1307 1313          return (0);
1308 1314  #endif
1309 1315  }
1310 1316  
1311 1317  /*
1312 1318   * Calculates a linear address, given the CS selector and PC values,
1313 1319   * by looking up the %cs selector in the process's LDT or the CPU's GDT.
1314 1320   * proc->p_ldtlock must be held across this call.
1315 1321   */
1316 1322  int
1317 1323  linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp)
1318 1324  {
1319 1325          user_desc_t     *descrp;
1320 1326          caddr_t         baseaddr;
1321 1327          uint16_t        idx = SELTOIDX(rp->r_cs);
1322 1328  
1323 1329          ASSERT(rp->r_cs <= 0xFFFF);
1324 1330          ASSERT(MUTEX_HELD(&p->p_ldtlock));
1325 1331  
1326 1332          if (SELISLDT(rp->r_cs)) {
1327 1333                  /*
1328 1334                   * Currently 64 bit processes cannot have private LDTs.
1329 1335                   */
1330 1336                  ASSERT(p->p_model != DATAMODEL_LP64);
1331 1337  
1332 1338                  if (p->p_ldt == NULL)
1333 1339                          return (-1);
1334 1340  
1335 1341                  descrp = &p->p_ldt[idx];
1336 1342                  baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
1337 1343  
1338 1344                  /*
1339 1345                   * Calculate the linear address (wraparound is not only ok,
1340 1346                   * it's expected behavior).  The cast to uint32_t is because
1341 1347                   * LDT selectors are only allowed in 32-bit processes.
1342 1348                   */
1343 1349                  *linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr +
1344 1350                      rp->r_pc);
1345 1351          } else {
1346 1352  #ifdef DEBUG
1347 1353                  descrp = &CPU->cpu_gdt[idx];
1348 1354                  baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
1349 1355                  /* GDT-based descriptors' base addresses should always be 0 */
1350 1356                  ASSERT(baseaddr == 0);
1351 1357  #endif
1352 1358                  *linearp = (caddr_t)(uintptr_t)rp->r_pc;
1353 1359          }
1354 1360  
1355 1361          return (0);
1356 1362  }
1357 1363  
1358 1364  /*
1359 1365   * The implementation of dtrace_linear_pc is similar to that of
1360 1366   * linear_pc, above, but here we acquire p_ldtlock before accessing
1361 1367   * p_ldt.  This implementation is used by the pid provider; we prefix
1362 1368   * it with "dtrace_" to avoid inducing spurious tracing events.
1363 1369   */
1364 1370  int
1365 1371  dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp)
1366 1372  {
1367 1373          user_desc_t     *descrp;
1368 1374          caddr_t         baseaddr;
1369 1375          uint16_t        idx = SELTOIDX(rp->r_cs);
1370 1376  
1371 1377          ASSERT(rp->r_cs <= 0xFFFF);
1372 1378  
1373 1379          if (SELISLDT(rp->r_cs)) {
1374 1380                  /*
1375 1381                   * Currently 64 bit processes cannot have private LDTs.
1376 1382                   */
1377 1383                  ASSERT(p->p_model != DATAMODEL_LP64);
1378 1384  
1379 1385                  mutex_enter(&p->p_ldtlock);
1380 1386                  if (p->p_ldt == NULL) {
1381 1387                          mutex_exit(&p->p_ldtlock);
1382 1388                          return (-1);
1383 1389                  }
1384 1390                  descrp = &p->p_ldt[idx];
1385 1391                  baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
1386 1392                  mutex_exit(&p->p_ldtlock);
1387 1393  
1388 1394                  /*
1389 1395                   * Calculate the linear address (wraparound is not only ok,
1390 1396                   * it's expected behavior).  The cast to uint32_t is because
1391 1397                   * LDT selectors are only allowed in 32-bit processes.
1392 1398                   */
1393 1399                  *linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr +
1394 1400                      rp->r_pc);
1395 1401          } else {
1396 1402  #ifdef DEBUG
1397 1403                  descrp = &CPU->cpu_gdt[idx];
1398 1404                  baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
1399 1405                  /* GDT-based descriptors' base addresses should always be 0 */
1400 1406                  ASSERT(baseaddr == 0);
1401 1407  #endif
1402 1408                  *linearp = (caddr_t)(uintptr_t)rp->r_pc;
1403 1409          }
1404 1410  
1405 1411          return (0);
1406 1412  }
1407 1413  
1408 1414  /*
1409 1415   * We need to post a soft interrupt to reprogram the lbolt cyclic when
1410 1416   * switching from event to cyclic driven lbolt. The following code adds
1411 1417   * and posts the softint for x86.
1412 1418   */
1413 1419  static ddi_softint_hdl_impl_t lbolt_softint_hdl =
1414 1420          {0, 0, NULL, NULL, 0, NULL, NULL, NULL};
1415 1421  
1416 1422  void
1417 1423  lbolt_softint_add(void)
1418 1424  {
1419 1425          (void) add_avsoftintr((void *)&lbolt_softint_hdl, LOCK_LEVEL,
1420 1426              (avfunc)lbolt_ev_to_cyclic, "lbolt_ev_to_cyclic", NULL, NULL);
1421 1427  }
1422 1428  
1423 1429  void
1424 1430  lbolt_softint_post(void)
1425 1431  {
1426 1432          (*setsoftint)(CBE_LOCK_PIL, lbolt_softint_hdl.ih_pending);
1427 1433  }
1428 1434  
1429 1435  boolean_t
1430 1436  plat_dr_check_capability(uint64_t features)
1431 1437  {
1432 1438          return ((plat_dr_options & features) == features);
1433 1439  }
1434 1440  
1435 1441  boolean_t
1436 1442  plat_dr_support_cpu(void)
1437 1443  {
1438 1444          return (plat_dr_options & PLAT_DR_FEATURE_CPU);
1439 1445  }
1440 1446  
1441 1447  boolean_t
1442 1448  plat_dr_support_memory(void)
1443 1449  {
1444 1450          return (plat_dr_options & PLAT_DR_FEATURE_MEMORY);
1445 1451  }
1446 1452

↓ open down ↓

1244 lines elided

↑ open up ↑

1447 1453  void
1448 1454  plat_dr_enable_capability(uint64_t features)
1449 1455  {
1450 1456          atomic_or_64(&plat_dr_options, features);
1451 1457  }
1452 1458  
1453 1459  void
1454 1460  plat_dr_disable_capability(uint64_t features)
1455 1461  {
1456 1462          atomic_and_64(&plat_dr_options, ~features);
     1463 +}
     1464 +
     1465 +/*
     1466 + * If SMAP is supported, patch the call site described by hid: calls to
     1467 + * smap_enable() are inlined as clac and calls to smap_disable() as stac.
     1468 + */
     1469 +void
     1470 +hotinline_smap(hotinline_desc_t *hid)
     1471 +{
     1472 +        if (is_x86_feature(x86_featureset, X86FSET_SMAP) == B_FALSE)
     1473 +                return;
     1474 +
     1475 +        if (strcmp(hid->hid_symname, "smap_enable") == 0) {
     1476 +                bcopy(clac_instr, (void *)hid->hid_instr_offset,
     1477 +                    sizeof (clac_instr));
     1478 +        } else if (strcmp(hid->hid_symname, "smap_disable") == 0) {
     1479 +                bcopy(stac_instr, (void *)hid->hid_instr_offset,
     1480 +                    sizeof (stac_instr));
     1481 +        }
     1482 +}
     1483 +
     1484 +/*
     1485 + * Loop through the module's hi_calls list and hand each descriptor
     1486 + * off to the appropriate inlining routine.
     1487 + */
     1488 +void
     1489 +do_hotinlines(struct module *mp)
     1490 +{
     1491 +        for (hotinline_desc_t *hid = mp->hi_calls; hid != NULL;
     1492 +            hid = hid->hid_next) {
     1493 +#if !defined(__xpv)
     1494 +                hotinline_smap(hid);
     1495 +#endif  /* __xpv */
     1496 +        }
1457 1497  }

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX